1 //===- VPlan.h - Represent A Vectorizer Plan --------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This file contains the declarations of the Vectorization Plan base classes:
11 /// 1. VPBasicBlock and VPRegionBlock that inherit from a common pure virtual
12 ///    VPBlockBase, together implementing a Hierarchical CFG;
13 /// 2. Pure virtual VPRecipeBase serving as the base class for recipes contained
14 ///    within VPBasicBlocks;
15 /// 3. Pure virtual VPSingleDefRecipe serving as a base class for recipes that
16 ///    also inherit from VPValue.
17 ///    also inherit from VPValue;
18 ///    instruction;
19 /// 5. The VPlan class holding a candidate for vectorization;
20 /// 6. The VPlanPrinter class providing a way to print a plan in dot format;
21 /// These are documented in docs/VectorizationPlan.rst.
22 //
23 //===----------------------------------------------------------------------===//
24 
25 #ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
26 #define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
27 
28 #include "VPlanAnalysis.h"
29 #include "VPlanValue.h"
30 #include "llvm/ADT/DenseMap.h"
31 #include "llvm/ADT/SmallBitVector.h"
32 #include "llvm/ADT/SmallPtrSet.h"
33 #include "llvm/ADT/SmallVector.h"
34 #include "llvm/ADT/Twine.h"
35 #include "llvm/ADT/ilist.h"
36 #include "llvm/ADT/ilist_node.h"
37 #include "llvm/Analysis/DomTreeUpdater.h"
38 #include "llvm/Analysis/IVDescriptors.h"
39 #include "llvm/Analysis/LoopInfo.h"
40 #include "llvm/Analysis/TargetTransformInfo.h"
41 #include "llvm/Analysis/VectorUtils.h"
42 #include "llvm/IR/DebugLoc.h"
43 #include "llvm/IR/FMF.h"
44 #include "llvm/IR/Operator.h"
45 #include "llvm/Support/InstructionCost.h"
46 #include <algorithm>
47 #include <cassert>
48 #include <cstddef>
49 #include <string>
50 
51 namespace llvm {
52 
53 class BasicBlock;
54 class DominatorTree;
55 class InnerLoopVectorizer;
56 class IRBuilderBase;
57 class LoopInfo;
58 class raw_ostream;
59 class RecurrenceDescriptor;
60 class SCEV;
61 class Type;
62 class VPBasicBlock;
63 class VPBuilder;
64 class VPRegionBlock;
65 class VPlan;
66 class VPReplicateRecipe;
67 class VPlanSlp;
68 class Value;
69 class LoopVectorizationCostModel;
70 class LoopVersioning;
71 
72 struct VPCostContext;
73 
74 namespace Intrinsic {
75 typedef unsigned ID;
76 }
77 
78 /// Returns a calculation for the total number of elements for a given \p VF.
79 /// For fixed-width vectors this value is a constant, whereas for scalable
80 /// vectors it is an expression determined at runtime.
81 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);
82 
83 /// Return a value for \p Step multiplied by VF.
84 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
85                        int64_t Step);
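// A minimal usage sketch of the two helpers above (the builder `B` and the
// integer type `I64Ty` are illustrative, not part of this header):
//
//   // For <vscale x 4 x i32>, computes `vscale * 4` at runtime; for a fixed
//   // VF of 4 it folds to the constant 4.
//   Value *RuntimeVF = getRuntimeVF(B, I64Ty, ElementCount::getScalable(4));
//   // Computes `2 * vscale * 4`, e.g. to advance an IV by two full vectors.
//   Value *Step = createStepForVF(B, I64Ty, ElementCount::getScalable(4), 2);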
86 
87 /// A helper function that returns the reciprocal of the block probability of
88 /// predicated blocks. If we return X, we are assuming the predicated block
89 /// will execute once for every X iterations of the loop header.
90 ///
91 /// TODO: We should use actual block probability here, if available. Currently,
92 ///       we always assume predicated blocks have a 50% chance of executing.
93 inline unsigned getReciprocalPredBlockProb() { return 2; }
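// For example, with the current fixed value of 2, a predicated block is
// assumed to execute once every 2 iterations of the loop header, so its cost
// contribution is divided by 2 (a heuristic, per the TODO above).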
94 
95 /// A range of powers-of-2 vectorization factors with fixed start and
96 /// adjustable end. The range includes start and excludes end, e.g.,:
97 /// [1, 16) = {1, 2, 4, 8}
98 struct VFRange {
99   // A power of 2.
100   const ElementCount Start;
101 
102   // A power of 2. If End <= Start, the range is empty.
103   ElementCount End;
104 
105   bool isEmpty() const {
106     return End.getKnownMinValue() <= Start.getKnownMinValue();
107   }
108 
109   VFRange(const ElementCount &Start, const ElementCount &End)
110       : Start(Start), End(End) {
111     assert(Start.isScalable() == End.isScalable() &&
112            "Both Start and End should have the same scalable flag");
113     assert(isPowerOf2_32(Start.getKnownMinValue()) &&
114            "Expected Start to be a power of 2");
115     assert(isPowerOf2_32(End.getKnownMinValue()) &&
116            "Expected End to be a power of 2");
117   }
118 
119   /// Iterator to iterate over vectorization factors in a VFRange.
120   class iterator
121       : public iterator_facade_base<iterator, std::forward_iterator_tag,
122                                     ElementCount> {
123     ElementCount VF;
124 
125   public:
126     iterator(ElementCount VF) : VF(VF) {}
127 
128     bool operator==(const iterator &Other) const { return VF == Other.VF; }
129 
130     ElementCount operator*() const { return VF; }
131 
132     iterator &operator++() {
133       VF *= 2;
134       return *this;
135     }
136   };
137 
138   iterator begin() { return iterator(Start); }
139   iterator end() {
140     assert(isPowerOf2_32(End.getKnownMinValue()));
141     return iterator(End);
142   }
143 };
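// A minimal iteration sketch (the bounds are illustrative): this range visits
// the fixed vectorization factors {1, 2, 4, 8} and stops before End = 16.
//
//   VFRange Range(ElementCount::getFixed(1), ElementCount::getFixed(16));
//   for (ElementCount VF : Range)
//     dbgs() << VF << "\n"; // prints 1, 2, 4, 8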
144 
145 using VPlanPtr = std::unique_ptr<VPlan>;
146 
147 /// In what follows, the term "input IR" refers to code that is fed into the
148 /// vectorizer whereas the term "output IR" refers to code that is generated by
149 /// the vectorizer.
150 
151 /// VPLane provides a way to access lanes in both fixed-width and scalable
152 /// vectors, where for the latter the lane index sometimes needs to be
153 /// calculated as a runtime expression.
154 class VPLane {
155 public:
156   /// Kind describes how to interpret Lane.
157   enum class Kind : uint8_t {
158     /// For First, Lane is the index into the first N elements of a
159     /// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>.
160     First,
161     /// For ScalableLast, Lane is the offset from the start of the last
162     /// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For
163     /// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of
164     /// 1 corresponds to `((vscale - 1) * N) + 1`, etc.
165     ScalableLast
166   };
167 
168 private:
169   /// Lane index, in [0..VF).
170   unsigned Lane;
171 
172   /// Indicates how the Lane should be interpreted, as described above.
173   Kind LaneKind;
174 
175 public:
176   VPLane(unsigned Lane) : Lane(Lane), LaneKind(VPLane::Kind::First) {}
177   VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {}
178 
179   static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); }
180 
181   static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset) {
182     assert(Offset > 0 && Offset <= VF.getKnownMinValue() &&
183            "trying to extract with invalid offset");
184     unsigned LaneOffset = VF.getKnownMinValue() - Offset;
185     Kind LaneKind;
186     if (VF.isScalable())
187       // In this case 'LaneOffset' refers to the offset from the start of the
188       // last subvector with VF.getKnownMinValue() elements.
189       LaneKind = VPLane::Kind::ScalableLast;
190     else
191       LaneKind = VPLane::Kind::First;
192     return VPLane(LaneOffset, LaneKind);
193   }
194 
195   static VPLane getLastLaneForVF(const ElementCount &VF) {
196     return getLaneFromEnd(VF, 1);
197   }
198 
199   /// Returns a compile-time known value for the lane index and asserts if the
200   /// lane can only be calculated at runtime.
201   unsigned getKnownLane() const {
202     assert(LaneKind == Kind::First);
203     return Lane;
204   }
205 
206   /// Returns an expression describing the lane index that can be used at
207   /// runtime.
208   Value *getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const;
209 
210   /// Returns the Kind of lane offset.
211   Kind getKind() const { return LaneKind; }
212 
213   /// Returns true if this is the first lane of the whole vector.
214   bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; }
215 
216   /// Maps the lane to a cache index based on \p VF.
217   unsigned mapToCacheIndex(const ElementCount &VF) const {
218     switch (LaneKind) {
219     case VPLane::Kind::ScalableLast:
220       assert(VF.isScalable() && Lane < VF.getKnownMinValue());
221       return VF.getKnownMinValue() + Lane;
222     default:
223       assert(Lane < VF.getKnownMinValue());
224       return Lane;
225     }
226   }
227 };
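// A worked example of the mapping above: for a scalable VF = <vscale x 4>,
// VPLane::getLaneFromEnd(VF, 1) yields Lane = 3 with Kind::ScalableLast,
// i.e. runtime index (vscale - 1) * 4 + 3, and mapToCacheIndex(VF) returns
// 4 + 3 = 7. For a fixed VF = 4, the same call yields Lane = 3 with
// Kind::First and cache index 3.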
228 
229 /// VPTransformState holds information passed down when "executing" a VPlan,
230 /// needed for generating the output IR.
231 struct VPTransformState {
232   VPTransformState(const TargetTransformInfo *TTI, ElementCount VF, unsigned UF,
233                    LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder,
234                    InnerLoopVectorizer *ILV, VPlan *Plan,
235                    Loop *CurrentParentLoop, Type *CanonicalIVTy);
236   /// Target Transform Info.
237   const TargetTransformInfo *TTI;
238 
239   /// The chosen Vectorization Factor of the loop being vectorized.
240   ElementCount VF;
241 
242   /// Hold the lane index used to generate specific scalar instructions.
243   /// std::nullopt indicates that all instances are to be generated, using
244   /// either scalar or vector instructions.
245   std::optional<VPLane> Lane;
246 
247   struct DataState {
248     // Each value from the original loop, when vectorized, is represented by a
249     // vector value in the map.
250     DenseMap<VPValue *, Value *> VPV2Vector;
251 
252     DenseMap<VPValue *, SmallVector<Value *, 4>> VPV2Scalars;
253   } Data;
254 
255   /// Get the generated vector Value for a given VPValue \p Def if \p IsScalar
256   /// is false, otherwise return the generated scalar. \see set.
257   Value *get(VPValue *Def, bool IsScalar = false);
258 
259   /// Get the generated Value for a given VPValue and a given Lane.
260   Value *get(VPValue *Def, const VPLane &Lane);
261 
262   bool hasVectorValue(VPValue *Def) { return Data.VPV2Vector.contains(Def); }
263 
264   bool hasScalarValue(VPValue *Def, VPLane Lane) {
265     auto I = Data.VPV2Scalars.find(Def);
266     if (I == Data.VPV2Scalars.end())
267       return false;
268     unsigned CacheIdx = Lane.mapToCacheIndex(VF);
269     return CacheIdx < I->second.size() && I->second[CacheIdx];
270   }
271 
272   /// Set the generated vector Value for a given VPValue, if \p
273   /// IsScalar is false. If \p IsScalar is true, set the scalar in lane 0.
274   void set(VPValue *Def, Value *V, bool IsScalar = false) {
275     if (IsScalar) {
276       set(Def, V, VPLane(0));
277       return;
278     }
279     assert((VF.isScalar() || V->getType()->isVectorTy()) &&
280            "scalar values must be stored as (0, 0)");
281     Data.VPV2Vector[Def] = V;
282   }
283 
284   /// Reset an existing vector value for \p Def.
285   void reset(VPValue *Def, Value *V) {
286     assert(Data.VPV2Vector.contains(Def) && "need to overwrite existing value");
287     Data.VPV2Vector[Def] = V;
288   }
289 
290   /// Set the generated scalar \p V for \p Def and the given \p Lane.
291   void set(VPValue *Def, Value *V, const VPLane &Lane) {
292     auto &Scalars = Data.VPV2Scalars[Def];
293     unsigned CacheIdx = Lane.mapToCacheIndex(VF);
294     if (Scalars.size() <= CacheIdx)
295       Scalars.resize(CacheIdx + 1);
296     assert(!Scalars[CacheIdx] && "should not overwrite existing value");
297     Scalars[CacheIdx] = V;
298   }
299 
300   /// Reset an existing scalar value for \p Def and a given \p Lane.
301   void reset(VPValue *Def, Value *V, const VPLane &Lane) {
302     auto Iter = Data.VPV2Scalars.find(Def);
303     assert(Iter != Data.VPV2Scalars.end() &&
304            "need to overwrite existing value");
305     unsigned CacheIdx = Lane.mapToCacheIndex(VF);
306     assert(CacheIdx < Iter->second.size() &&
307            "need to overwrite existing value");
308     Iter->second[CacheIdx] = V;
309   }
310 
311   /// Add additional metadata to \p To that was not present on \p Orig.
312   ///
313   /// Currently this is used to add the noalias annotations based on the
314   /// inserted memchecks.  Use this for instructions that are *cloned* into the
315   /// vector loop.
316   void addNewMetadata(Instruction *To, const Instruction *Orig);
317 
318   /// Add metadata from one instruction to another.
319   ///
320   /// This includes both the original MDs from \p From and additional ones (\see
321   /// addNewMetadata).  Use this for *newly created* instructions in the vector
322   /// loop.
323   void addMetadata(Value *To, Instruction *From);
324 
325   /// Set the debug location in the builder using the debug location \p DL.
326   void setDebugLocFrom(DebugLoc DL);
327 
328   /// Construct the vector value of a scalarized value \p Def one lane at a time.
329   void packScalarIntoVectorValue(VPValue *Def, const VPLane &Lane);
330 
331   /// Hold state information used when constructing the CFG of the output IR,
332   /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
333   struct CFGState {
334     /// The previous VPBasicBlock visited. Initially set to null.
335     VPBasicBlock *PrevVPBB = nullptr;
336 
337     /// The previous IR BasicBlock created or used. Initially set to the new
338     /// header BasicBlock.
339     BasicBlock *PrevBB = nullptr;
340 
341     /// The last IR BasicBlock in the output IR. Set to the exit block of the
342     /// vector loop.
343     BasicBlock *ExitBB = nullptr;
344 
345     /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
346     /// of replication, maps the BasicBlock of the last replica created.
347     SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;
348 
349     /// Updater for the DominatorTree.
350     DomTreeUpdater DTU;
351 
352     CFGState(DominatorTree *DT)
353         : DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy) {}
354 
355     /// Returns the BasicBlock* mapped to the pre-header of the loop region
356     /// containing \p R.
357     BasicBlock *getPreheaderBBFor(VPRecipeBase *R);
358   } CFG;
359 
360   /// Hold a pointer to LoopInfo to register new basic blocks in the loop.
361   LoopInfo *LI;
362 
363   /// Hold a reference to the IRBuilder used to generate output IR code.
364   IRBuilderBase &Builder;
365 
366   /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
367   InnerLoopVectorizer *ILV;
368 
369   /// Pointer to the VPlan for which code is generated.
370   VPlan *Plan;
371 
372   /// The parent loop object for the current scope, or nullptr.
373   Loop *CurrentParentLoop = nullptr;
374 
375   /// LoopVersioning.  It's only set up (non-null) if memchecks were
376   /// used.
377   ///
378   /// This is currently only used to add no-alias metadata based on the
379   /// memchecks.  The actual versioning is performed manually.
380   LoopVersioning *LVer = nullptr;
381 
382   /// Map SCEVs to their expanded values. Populated when executing
383   /// VPExpandSCEVRecipes.
384   DenseMap<const SCEV *, Value *> ExpandedSCEVs;
385 
386   /// VPlan-based type analysis.
387   VPTypeAnalysis TypeAnalysis;
388 };
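// A minimal sketch of the value-caching flow (identifiers are illustrative):
//
//   State.set(Def, WideV);                 // cache the generated vector value
//   State.set(Def, LaneV, VPLane(0));      // cache a per-lane scalar value
//   Value *V = State.get(Def);             // look up the vector value
//   Value *S = State.get(Def, VPLane(0));  // look up the lane-0 scalar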
389 
390 /// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
391 /// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock.
392 class VPBlockBase {
393   friend class VPBlockUtils;
394 
395   const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
396 
397   /// An optional name for the block.
398   std::string Name;
399 
400   /// The immediate VPRegionBlock which this VPBlockBase belongs to, or null if
401   /// it is a topmost VPBlockBase.
402   VPRegionBlock *Parent = nullptr;
403 
404   /// List of predecessor blocks.
405   SmallVector<VPBlockBase *, 1> Predecessors;
406 
407   /// List of successor blocks.
408   SmallVector<VPBlockBase *, 1> Successors;
409 
410   /// VPlan containing the block. Can only be set on the entry block of the
411   /// plan.
412   VPlan *Plan = nullptr;
413 
414   /// Add \p Successor as the last successor to this block.
415   void appendSuccessor(VPBlockBase *Successor) {
416     assert(Successor && "Cannot add nullptr successor!");
417     Successors.push_back(Successor);
418   }
419 
420   /// Add \p Predecessor as the last predecessor to this block.
421   void appendPredecessor(VPBlockBase *Predecessor) {
422     assert(Predecessor && "Cannot add nullptr predecessor!");
423     Predecessors.push_back(Predecessor);
424   }
425 
426   /// Remove \p Predecessor from the predecessors of this block.
427   void removePredecessor(VPBlockBase *Predecessor) {
428     auto Pos = find(Predecessors, Predecessor);
429     assert(Pos != Predecessors.end() && "Predecessor does not exist");
430     Predecessors.erase(Pos);
431   }
432 
433   /// Remove \p Successor from the successors of this block.
434   void removeSuccessor(VPBlockBase *Successor) {
435     auto Pos = find(Successors, Successor);
436     assert(Pos != Successors.end() && "Successor does not exist");
437     Successors.erase(Pos);
438   }
439 
440   /// This function replaces one predecessor with another, useful when
441   /// trying to replace an old block in the CFG with a new one.
442   void replacePredecessor(VPBlockBase *Old, VPBlockBase *New) {
443     auto I = find(Predecessors, Old);
444     assert(I != Predecessors.end());
445     assert(Old->getParent() == New->getParent() &&
446            "replaced predecessor must have the same parent");
447     *I = New;
448   }
449 
450   /// This function replaces one successor with another, useful when
451   /// trying to replace an old block in the CFG with a new one.
452   void replaceSuccessor(VPBlockBase *Old, VPBlockBase *New) {
453     auto I = find(Successors, Old);
454     assert(I != Successors.end());
455     assert(Old->getParent() == New->getParent() &&
456            "replaced successor must have the same parent");
457     *I = New;
458   }
459 
460 protected:
461   VPBlockBase(const unsigned char SC, const std::string &N)
462       : SubclassID(SC), Name(N) {}
463 
464 public:
465   /// An enumeration for keeping track of the concrete subclasses of VPBlockBase
466   /// that are actually instantiated. Values of this enumeration are kept in the
467   /// SubclassID field of the VPBlockBase objects. They are used for concrete
468   /// type identification.
469   using VPBlockTy = enum { VPRegionBlockSC, VPBasicBlockSC, VPIRBasicBlockSC };
470 
471   using VPBlocksTy = SmallVectorImpl<VPBlockBase *>;
472 
473   virtual ~VPBlockBase() = default;
474 
475   const std::string &getName() const { return Name; }
476 
477   void setName(const Twine &newName) { Name = newName.str(); }
478 
479   /// \return an ID for the concrete type of this object.
480   /// This is used to implement the classof checks. This should not be used
481   /// for any other purpose, as the values may change as LLVM evolves.
482   unsigned getVPBlockID() const { return SubclassID; }
483 
484   VPRegionBlock *getParent() { return Parent; }
485   const VPRegionBlock *getParent() const { return Parent; }
486 
487   /// \return A pointer to the plan containing the current block.
488   VPlan *getPlan();
489   const VPlan *getPlan() const;
490 
491   /// Sets the pointer of the plan containing the block. The block must be the
492   /// entry block into the VPlan.
493   void setPlan(VPlan *ParentPlan);
494 
495   void setParent(VPRegionBlock *P) { Parent = P; }
496 
497   /// \return the VPBasicBlock that is the entry of this VPBlockBase,
498   /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
499   /// VPBlockBase is a VPBasicBlock, it is returned.
500   const VPBasicBlock *getEntryBasicBlock() const;
501   VPBasicBlock *getEntryBasicBlock();
502 
503   /// \return the VPBasicBlock that is the exiting block of this VPBlockBase,
504   /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
505   /// VPBlockBase is a VPBasicBlock, it is returned.
506   const VPBasicBlock *getExitingBasicBlock() const;
507   VPBasicBlock *getExitingBasicBlock();
508 
509   const VPBlocksTy &getSuccessors() const { return Successors; }
510   VPBlocksTy &getSuccessors() { return Successors; }
511 
512   iterator_range<VPBlockBase **> successors() { return Successors; }
513   iterator_range<VPBlockBase **> predecessors() { return Predecessors; }
514 
515   const VPBlocksTy &getPredecessors() const { return Predecessors; }
516   VPBlocksTy &getPredecessors() { return Predecessors; }
517 
518   /// \return the successor of this VPBlockBase if it has a single successor.
519   /// Otherwise return a null pointer.
520   VPBlockBase *getSingleSuccessor() const {
521     return (Successors.size() == 1 ? *Successors.begin() : nullptr);
522   }
523 
524   /// \return the predecessor of this VPBlockBase if it has a single
525   /// predecessor. Otherwise return a null pointer.
526   VPBlockBase *getSinglePredecessor() const {
527     return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr);
528   }
529 
530   size_t getNumSuccessors() const { return Successors.size(); }
531   size_t getNumPredecessors() const { return Predecessors.size(); }
532 
533   /// An Enclosing Block of a block B is any block containing B, including B
534   /// itself. \return the closest enclosing block starting from "this", which
535   /// has successors. \return the root enclosing block if all enclosing blocks
536   /// have no successors.
537   VPBlockBase *getEnclosingBlockWithSuccessors();
538 
539   /// \return the closest enclosing block starting from "this", which has
540   /// predecessors. \return the root enclosing block if all enclosing blocks
541   /// have no predecessors.
542   VPBlockBase *getEnclosingBlockWithPredecessors();
543 
544   /// \return the successors either attached directly to this VPBlockBase or, if
545   /// this VPBlockBase is the exit block of a VPRegionBlock and has no
546   /// successors of its own, search recursively for the first enclosing
547   /// VPRegionBlock that has successors and return them. If no such
548   /// VPRegionBlock exists, return the (empty) successors of the topmost
549   /// VPBlockBase reached.
550   const VPBlocksTy &getHierarchicalSuccessors() {
551     return getEnclosingBlockWithSuccessors()->getSuccessors();
552   }
553 
554   /// \return the hierarchical successor of this VPBlockBase if it has a single
555   /// hierarchical successor. Otherwise return a null pointer.
556   VPBlockBase *getSingleHierarchicalSuccessor() {
557     return getEnclosingBlockWithSuccessors()->getSingleSuccessor();
558   }
559 
560   /// \return the predecessors either attached directly to this VPBlockBase or,
561   /// if this VPBlockBase is the entry block of a VPRegionBlock and has no
562   /// predecessors of its own, search recursively for the first enclosing
563   /// VPRegionBlock that has predecessors and return them. If no such
564   /// VPRegionBlock exists, return the (empty) predecessors of the topmost
565   /// VPBlockBase reached.
566   const VPBlocksTy &getHierarchicalPredecessors() {
567     return getEnclosingBlockWithPredecessors()->getPredecessors();
568   }
569 
570   /// \return the hierarchical predecessor of this VPBlockBase if it has a
571   /// single hierarchical predecessor. Otherwise return a null pointer.
572   VPBlockBase *getSingleHierarchicalPredecessor() {
573     return getEnclosingBlockWithPredecessors()->getSinglePredecessor();
574   }
575 
576   /// Set a given VPBlockBase \p Successor as the single successor of this
577   /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor.
578   /// This VPBlockBase must have no successors.
579   void setOneSuccessor(VPBlockBase *Successor) {
580     assert(Successors.empty() && "Setting one successor when others exist.");
581     assert(Successor->getParent() == getParent() &&
582            "connected blocks must have the same parent");
583     appendSuccessor(Successor);
584   }
585 
586   /// Set two given VPBlockBases \p IfTrue and \p IfFalse to be the two
587   /// successors of this VPBlockBase. This VPBlockBase is not added as
588   /// predecessor of \p IfTrue or \p IfFalse. This VPBlockBase must have no
589   /// successors.
590   void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse) {
591     assert(Successors.empty() && "Setting two successors when others exist.");
592     appendSuccessor(IfTrue);
593     appendSuccessor(IfFalse);
594   }
595 
596   /// Set each VPBasicBlock in \p NewPreds as predecessor of this VPBlockBase.
597   /// This VPBlockBase must have no predecessors. This VPBlockBase is not added
598   /// as successor of any VPBasicBlock in \p NewPreds.
599   void setPredecessors(ArrayRef<VPBlockBase *> NewPreds) {
600     assert(Predecessors.empty() && "Block predecessors already set.");
601     for (auto *Pred : NewPreds)
602       appendPredecessor(Pred);
603   }
604 
605   /// Set each VPBasicBlock in \p NewSuccs as successor of this VPBlockBase.
606   /// This VPBlockBase must have no successors. This VPBlockBase is not added
607   /// as predecessor of any VPBasicBlock in \p NewSuccs.
608   void setSuccessors(ArrayRef<VPBlockBase *> NewSuccs) {
609     assert(Successors.empty() && "Block successors already set.");
610     for (auto *Succ : NewSuccs)
611       appendSuccessor(Succ);
612   }
613 
614   /// Remove all the predecessors of this block.
615   void clearPredecessors() { Predecessors.clear(); }
616 
617   /// Remove all the successors of this block.
618   void clearSuccessors() { Successors.clear(); }
619 
620   /// Swap successors of the block. The block must have exactly 2 successors.
621   // TODO: This should be part of introducing conditional branch recipes rather
622   // than being independent.
623   void swapSuccessors() {
624     assert(Successors.size() == 2 && "must have 2 successors to swap");
625     std::swap(Successors[0], Successors[1]);
626   }
627 
628   /// The method which generates the output IR that corresponds to this
629   /// VPBlockBase, thereby "executing" the VPlan.
630   virtual void execute(VPTransformState *State) = 0;
631 
632   /// Return the cost of the block.
633   virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0;
634 
635   /// Return true if it is legal to hoist instructions into this block.
636   bool isLegalToHoistInto() {
637     // There are currently no constraints that prevent an instruction from being
638     // hoisted into a VPBlockBase.
639     return true;
640   }
641 
642 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
643   void printAsOperand(raw_ostream &OS, bool PrintType = false) const {
644     OS << getName();
645   }
646 
647   /// Print plain-text dump of this VPBlockBase to \p O, prefixing all lines
648   /// with \p Indent. \p SlotTracker is used to print unnamed VPValues using
649   /// consecutive numbers.
650   ///
651   /// Note that the numbering is applied to the whole VPlan, so printing
652   /// individual blocks is consistent with the whole VPlan printing.
653   virtual void print(raw_ostream &O, const Twine &Indent,
654                      VPSlotTracker &SlotTracker) const = 0;
655 
656   /// Print plain-text dump of this VPlan to \p O.
657   void print(raw_ostream &O) const {
658     VPSlotTracker SlotTracker(getPlan());
659     print(O, "", SlotTracker);
660   }
661 
662   /// Print the successors of this block to \p O, prefixing all lines with \p
663   /// Indent.
664   void printSuccessors(raw_ostream &O, const Twine &Indent) const;
665 
666   /// Dump this VPBlockBase to dbgs().
667   LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
668 #endif
669 
670   /// Clone the current block and its recipes without updating the operands of
671   /// the cloned recipes, including all blocks in the single-entry single-exit
672   /// region for VPRegionBlocks.
673   virtual VPBlockBase *clone() = 0;
674 };
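// A short traversal sketch: the hierarchical accessors let CFG walks step out
// of regions transparently (`Block` is an illustrative VPBlockBase pointer).
//
//   // If Block is the exiting block of a region and has no successors of its
//   // own, this visits the enclosing region's successors instead.
//   for (VPBlockBase *Succ : Block->getHierarchicalSuccessors())
//     dbgs() << Succ->getName() << "\n";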
675 
676 /// Struct to hold various analyses needed for cost computations.
677 struct VPCostContext {
678   const TargetTransformInfo &TTI;
679   const TargetLibraryInfo &TLI;
680   VPTypeAnalysis Types;
681   LLVMContext &LLVMCtx;
682   LoopVectorizationCostModel &CM;
683   SmallPtrSet<Instruction *, 8> SkipCostComputation;
684   TargetTransformInfo::TargetCostKind CostKind;
685 
686   VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
687                 Type *CanIVTy, LoopVectorizationCostModel &CM,
688                 TargetTransformInfo::TargetCostKind CostKind)
689       : TTI(TTI), TLI(TLI), Types(CanIVTy), LLVMCtx(CanIVTy->getContext()),
690         CM(CM), CostKind(CostKind) {}
691 
692   /// Return the cost for \p UI with \p VF using the legacy cost model as
693   /// fallback until computing the cost of all recipes migrates to VPlan.
694   InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const;
695 
696   /// Return true if the cost for \p UI shouldn't be computed, e.g. because it
697   /// has already been pre-computed.
698   bool skipCostComputation(Instruction *UI, bool IsVector) const;
699 
700   /// Returns the OperandInfo for \p V, if it is a live-in.
701   TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const;
702 };
703 
704 /// VPRecipeBase is a base class modeling a sequence of one or more output IR
705 /// instructions. VPRecipeBase owns the VPValues it defines through VPDef
706 /// and is responsible for deleting its defined values. Single-value
707 /// recipes must inherit from VPSingleDefRecipe instead of inheriting from
708 /// both VPRecipeBase and VPValue separately.
709 class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
710                      public VPDef,
711                      public VPUser {
712   friend VPBasicBlock;
713   friend class VPBlockUtils;
714 
715   /// Each VPRecipe belongs to a single VPBasicBlock.
716   VPBasicBlock *Parent = nullptr;
717 
718   /// The debug location for the recipe.
719   DebugLoc DL;
720 
721 public:
722   VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands,
723                DebugLoc DL = {})
724       : VPDef(SC), VPUser(Operands), DL(DL) {}
725 
726   template <typename IterT>
727   VPRecipeBase(const unsigned char SC, iterator_range<IterT> Operands,
728                DebugLoc DL = {})
729       : VPDef(SC), VPUser(Operands), DL(DL) {}
730   virtual ~VPRecipeBase() = default;
731 
732   /// Clone the current recipe.
733   virtual VPRecipeBase *clone() = 0;
734 
735   /// \return the VPBasicBlock which this VPRecipe belongs to.
736   VPBasicBlock *getParent() { return Parent; }
737   const VPBasicBlock *getParent() const { return Parent; }
738 
739   /// The method which generates the output IR instructions that correspond to
740   /// this VPRecipe, thereby "executing" the VPlan.
741   virtual void execute(VPTransformState &State) = 0;
742 
743   /// Return the cost of this recipe, taking into account if the cost
744   /// computation should be skipped and the ForceTargetInstructionCost flag.
745   /// Also takes care of printing the cost for debugging.
746   InstructionCost cost(ElementCount VF, VPCostContext &Ctx);
747 
748   /// Insert an unlinked recipe into a basic block immediately before
749   /// the specified recipe.
750   void insertBefore(VPRecipeBase *InsertPos);
751   /// Insert an unlinked recipe into \p BB immediately before the insertion
752   /// point \p IP.
753   void insertBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator IP);
754 
755   /// Insert an unlinked Recipe into a basic block immediately after
756   /// the specified Recipe.
757   void insertAfter(VPRecipeBase *InsertPos);
758 
759   /// Unlink this recipe from its current VPBasicBlock and insert it into
760   /// the VPBasicBlock that MovePos lives in, right after MovePos.
761   void moveAfter(VPRecipeBase *MovePos);
762 
763   /// Unlink this recipe and insert into BB before I.
764   ///
765   /// \pre I is a valid iterator into BB.
766   void moveBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator I);
767 
768   /// This method unlinks 'this' from the containing basic block, but does not
769   /// delete it.
770   void removeFromParent();
771 
772   /// This method unlinks 'this' from the containing basic block and deletes it.
773   ///
774   /// \returns an iterator pointing to the element after the erased one
775   iplist<VPRecipeBase>::iterator eraseFromParent();
776 
777   /// Method to support type inquiry through isa, cast, and dyn_cast.
778   static inline bool classof(const VPDef *D) {
779     // All VPDefs are also VPRecipeBases.
780     return true;
781   }
782 
783   static inline bool classof(const VPUser *U) { return true; }
784 
785   /// Returns true if the recipe may have side-effects.
786   bool mayHaveSideEffects() const;
787 
788   /// Returns true for PHI-like recipes.
789   bool isPhi() const {
790     return getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC;
791   }
792 
793   /// Returns true if the recipe may read from memory.
794   bool mayReadFromMemory() const;
795 
796   /// Returns true if the recipe may write to memory.
797   bool mayWriteToMemory() const;
798 
799   /// Returns true if the recipe may read from or write to memory.
800   bool mayReadOrWriteMemory() const {
801     return mayReadFromMemory() || mayWriteToMemory();
802   }
803 
804   /// Returns the debug location of the recipe.
805   DebugLoc getDebugLoc() const { return DL; }
806 
807 protected:
808   /// Compute the cost of this recipe either using a recipe's specialized
809   /// implementation or using the legacy cost model and the underlying
810   /// instructions.
811   virtual InstructionCost computeCost(ElementCount VF,
812                                       VPCostContext &Ctx) const;
813 };
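// A short usage sketch of the recipe placement API (names are illustrative):
//
//   VPRecipeBase *Copy = R->clone();  // clone an existing recipe R
//   Copy->insertBefore(R);            // link the copy into R's block
//   Copy->moveAfter(R);               // relocate it right after R
//   Copy->eraseFromParent();          // unlink and delete it again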
814 
815 // Helper macro to define common classof implementations for recipes.
816 #define VP_CLASSOF_IMPL(VPDefID)                                               \
817   static inline bool classof(const VPDef *D) {                                 \
818     return D->getVPDefID() == VPDefID;                                         \
819   }                                                                            \
820   static inline bool classof(const VPValue *V) {                               \
821     auto *R = V->getDefiningRecipe();                                          \
822     return R && R->getVPDefID() == VPDefID;                                    \
823   }                                                                            \
824   static inline bool classof(const VPUser *U) {                                \
825     auto *R = dyn_cast<VPRecipeBase>(U);                                       \
826     return R && R->getVPDefID() == VPDefID;                                    \
827   }                                                                            \
828   static inline bool classof(const VPRecipeBase *R) {                          \
829     return R->getVPDefID() == VPDefID;                                         \
830   }                                                                            \
831   static inline bool classof(const VPSingleDefRecipe *R) {                     \
832     return R->getVPDefID() == VPDefID;                                         \
833   }
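// For example, VPInstruction below invokes VP_CLASSOF_IMPL(VPDef::VPInstructionSC),
// which makes isa<VPInstruction>, cast<> and dyn_cast<> work uniformly on VPDef,
// VPValue, VPUser, VPRecipeBase and VPSingleDefRecipe pointers.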
834 
835 /// VPSingleDefRecipe is a base class for recipes modeling a sequence of one
836 /// or more output IR instructions that define a single result VPValue.
837 /// Note that VPRecipeBase must be inherited from before VPValue.
838 class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
839 public:
840   template <typename IterT>
841   VPSingleDefRecipe(const unsigned char SC, IterT Operands, DebugLoc DL = {})
842       : VPRecipeBase(SC, Operands, DL), VPValue(this) {}
843 
844   VPSingleDefRecipe(const unsigned char SC, ArrayRef<VPValue *> Operands,
845                     DebugLoc DL = {})
846       : VPRecipeBase(SC, Operands, DL), VPValue(this) {}
847 
848   template <typename IterT>
849   VPSingleDefRecipe(const unsigned char SC, IterT Operands, Value *UV,
850                     DebugLoc DL = {})
851       : VPRecipeBase(SC, Operands, DL), VPValue(this, UV) {}
852 
853   static inline bool classof(const VPRecipeBase *R) {
854     switch (R->getVPDefID()) {
855     case VPRecipeBase::VPDerivedIVSC:
856     case VPRecipeBase::VPEVLBasedIVPHISC:
857     case VPRecipeBase::VPExpandSCEVSC:
858     case VPRecipeBase::VPInstructionSC:
859     case VPRecipeBase::VPReductionEVLSC:
860     case VPRecipeBase::VPReductionSC:
861     case VPRecipeBase::VPReplicateSC:
862     case VPRecipeBase::VPScalarIVStepsSC:
863     case VPRecipeBase::VPVectorPointerSC:
864     case VPRecipeBase::VPReverseVectorPointerSC:
865     case VPRecipeBase::VPWidenCallSC:
866     case VPRecipeBase::VPWidenCanonicalIVSC:
867     case VPRecipeBase::VPWidenCastSC:
868     case VPRecipeBase::VPWidenGEPSC:
869     case VPRecipeBase::VPWidenIntrinsicSC:
870     case VPRecipeBase::VPWidenSC:
871     case VPRecipeBase::VPWidenEVLSC:
872     case VPRecipeBase::VPWidenSelectSC:
873     case VPRecipeBase::VPBlendSC:
874     case VPRecipeBase::VPPredInstPHISC:
875     case VPRecipeBase::VPCanonicalIVPHISC:
876     case VPRecipeBase::VPActiveLaneMaskPHISC:
877     case VPRecipeBase::VPFirstOrderRecurrencePHISC:
878     case VPRecipeBase::VPWidenPHISC:
879     case VPRecipeBase::VPWidenIntOrFpInductionSC:
880     case VPRecipeBase::VPWidenPointerInductionSC:
881     case VPRecipeBase::VPReductionPHISC:
882     case VPRecipeBase::VPScalarCastSC:
883     case VPRecipeBase::VPPartialReductionSC:
884       return true;
885     case VPRecipeBase::VPBranchOnMaskSC:
886     case VPRecipeBase::VPInterleaveSC:
887     case VPRecipeBase::VPIRInstructionSC:
888     case VPRecipeBase::VPWidenLoadEVLSC:
889     case VPRecipeBase::VPWidenLoadSC:
890     case VPRecipeBase::VPWidenStoreEVLSC:
891     case VPRecipeBase::VPWidenStoreSC:
892     case VPRecipeBase::VPHistogramSC:
893       // TODO: Widened stores don't define a value, but widened loads do. Split
894       // the recipes to be able to make widened loads VPSingleDefRecipes.
895       return false;
896     }
897     llvm_unreachable("Unhandled VPDefID");
898   }
899 
900   static inline bool classof(const VPUser *U) {
901     auto *R = dyn_cast<VPRecipeBase>(U);
902     return R && classof(R);
903   }
904 
905   virtual VPSingleDefRecipe *clone() override = 0;
906 
907   /// Returns the underlying instruction.
908   Instruction *getUnderlyingInstr() {
909     return cast<Instruction>(getUnderlyingValue());
910   }
911   const Instruction *getUnderlyingInstr() const {
912     return cast<Instruction>(getUnderlyingValue());
913   }
914 
915 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
916   /// Print this VPSingleDefRecipe to dbgs() (for debugging).
917   LLVM_DUMP_METHOD void dump() const;
918 #endif
919 };
920 
921 /// Class to record the LLVM IR flags of a recipe along with the recipe itself.
922 class VPRecipeWithIRFlags : public VPSingleDefRecipe {
923   enum class OperationType : unsigned char {
924     Cmp,
925     OverflowingBinOp,
926     DisjointOp,
927     PossiblyExactOp,
928     GEPOp,
929     FPMathOp,
930     NonNegOp,
931     Other
932   };
933 
934 public:
935   struct WrapFlagsTy {
936     char HasNUW : 1;
937     char HasNSW : 1;
938 
939     WrapFlagsTy(bool HasNUW, bool HasNSW) : HasNUW(HasNUW), HasNSW(HasNSW) {}
940   };
941 
942   struct DisjointFlagsTy {
943     char IsDisjoint : 1;
944     DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {}
945   };
946 
947 private:
948   struct ExactFlagsTy {
949     char IsExact : 1;
950   };
951   struct NonNegFlagsTy {
952     char NonNeg : 1;
953   };
954   struct FastMathFlagsTy {
955     char AllowReassoc : 1;
956     char NoNaNs : 1;
957     char NoInfs : 1;
958     char NoSignedZeros : 1;
959     char AllowReciprocal : 1;
960     char AllowContract : 1;
961     char ApproxFunc : 1;
962 
963     FastMathFlagsTy(const FastMathFlags &FMF);
964   };
965 
966   OperationType OpType;
967 
968   union {
969     CmpInst::Predicate CmpPredicate;
970     WrapFlagsTy WrapFlags;
971     DisjointFlagsTy DisjointFlags;
972     ExactFlagsTy ExactFlags;
973     GEPNoWrapFlags GEPFlags;
974     NonNegFlagsTy NonNegFlags;
975     FastMathFlagsTy FMFs;
976     unsigned AllFlags;
977   };
978 
979 protected:
980   void transferFlags(VPRecipeWithIRFlags &Other) {
981     OpType = Other.OpType;
982     AllFlags = Other.AllFlags;
983   }
984 
985 public:
986   template <typename IterT>
987   VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, DebugLoc DL = {})
988       : VPSingleDefRecipe(SC, Operands, DL) {
989     OpType = OperationType::Other;
990     AllFlags = 0;
991   }
992 
993   template <typename IterT>
994   VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, Instruction &I)
995       : VPSingleDefRecipe(SC, Operands, &I, I.getDebugLoc()) {
996     if (auto *Op = dyn_cast<CmpInst>(&I)) {
997       OpType = OperationType::Cmp;
998       CmpPredicate = Op->getPredicate();
999     } else if (auto *Op = dyn_cast<PossiblyDisjointInst>(&I)) {
1000       OpType = OperationType::DisjointOp;
1001       DisjointFlags.IsDisjoint = Op->isDisjoint();
1002     } else if (auto *Op = dyn_cast<OverflowingBinaryOperator>(&I)) {
1003       OpType = OperationType::OverflowingBinOp;
1004       WrapFlags = {Op->hasNoUnsignedWrap(), Op->hasNoSignedWrap()};
1005     } else if (auto *Op = dyn_cast<PossiblyExactOperator>(&I)) {
1006       OpType = OperationType::PossiblyExactOp;
1007       ExactFlags.IsExact = Op->isExact();
1008     } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
1009       OpType = OperationType::GEPOp;
1010       GEPFlags = GEP->getNoWrapFlags();
1011     } else if (auto *PNNI = dyn_cast<PossiblyNonNegInst>(&I)) {
1012       OpType = OperationType::NonNegOp;
1013       NonNegFlags.NonNeg = PNNI->hasNonNeg();
1014     } else if (auto *Op = dyn_cast<FPMathOperator>(&I)) {
1015       OpType = OperationType::FPMathOp;
1016       FMFs = Op->getFastMathFlags();
1017     } else {
1018       OpType = OperationType::Other;
1019       AllFlags = 0;
1020     }
1021   }
1022 
1023   template <typename IterT>
1024   VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
1025                       CmpInst::Predicate Pred, DebugLoc DL = {})
1026       : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::Cmp),
1027         CmpPredicate(Pred) {}
1028 
1029   template <typename IterT>
1030   VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
1031                       WrapFlagsTy WrapFlags, DebugLoc DL = {})
1032       : VPSingleDefRecipe(SC, Operands, DL),
1033         OpType(OperationType::OverflowingBinOp), WrapFlags(WrapFlags) {}
1034 
1035   template <typename IterT>
1036   VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
1037                       FastMathFlags FMFs, DebugLoc DL = {})
1038       : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::FPMathOp),
1039         FMFs(FMFs) {}
1040 
1041   template <typename IterT>
1042   VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
1043                       DisjointFlagsTy DisjointFlags, DebugLoc DL = {})
1044       : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::DisjointOp),
1045         DisjointFlags(DisjointFlags) {}
1046 
1047 protected:
1048   template <typename IterT>
1049   VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
1050                       GEPNoWrapFlags GEPFlags, DebugLoc DL = {})
1051       : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::GEPOp),
1052         GEPFlags(GEPFlags) {}
1053 
1054 public:
1055   static inline bool classof(const VPRecipeBase *R) {
1056     return R->getVPDefID() == VPRecipeBase::VPInstructionSC ||
1057            R->getVPDefID() == VPRecipeBase::VPWidenSC ||
1058            R->getVPDefID() == VPRecipeBase::VPWidenEVLSC ||
1059            R->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
1060            R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
1061            R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
1062            R->getVPDefID() == VPRecipeBase::VPReverseVectorPointerSC ||
1063            R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
1064   }
1065 
1066   static inline bool classof(const VPUser *U) {
1067     auto *R = dyn_cast<VPRecipeBase>(U);
1068     return R && classof(R);
1069   }
1070 
1071   /// Drop all poison-generating flags.
1072   void dropPoisonGeneratingFlags() {
1073     // NOTE: This needs to be kept in-sync with
1074     // Instruction::dropPoisonGeneratingFlags.
1075     switch (OpType) {
1076     case OperationType::OverflowingBinOp:
1077       WrapFlags.HasNUW = false;
1078       WrapFlags.HasNSW = false;
1079       break;
1080     case OperationType::DisjointOp:
1081       DisjointFlags.IsDisjoint = false;
1082       break;
1083     case OperationType::PossiblyExactOp:
1084       ExactFlags.IsExact = false;
1085       break;
1086     case OperationType::GEPOp:
1087       GEPFlags = GEPNoWrapFlags::none();
1088       break;
1089     case OperationType::FPMathOp:
1090       FMFs.NoNaNs = false;
1091       FMFs.NoInfs = false;
1092       break;
1093     case OperationType::NonNegOp:
1094       NonNegFlags.NonNeg = false;
1095       break;
1096     case OperationType::Cmp:
1097     case OperationType::Other:
1098       break;
1099     }
1100   }
1101 
1102   /// Set the IR flags for \p I.
1103   void setFlags(Instruction *I) const {
1104     switch (OpType) {
1105     case OperationType::OverflowingBinOp:
1106       I->setHasNoUnsignedWrap(WrapFlags.HasNUW);
1107       I->setHasNoSignedWrap(WrapFlags.HasNSW);
1108       break;
1109     case OperationType::DisjointOp:
1110       cast<PossiblyDisjointInst>(I)->setIsDisjoint(DisjointFlags.IsDisjoint);
1111       break;
1112     case OperationType::PossiblyExactOp:
1113       I->setIsExact(ExactFlags.IsExact);
1114       break;
1115     case OperationType::GEPOp:
1116       cast<GetElementPtrInst>(I)->setNoWrapFlags(GEPFlags);
1117       break;
1118     case OperationType::FPMathOp:
1119       I->setHasAllowReassoc(FMFs.AllowReassoc);
1120       I->setHasNoNaNs(FMFs.NoNaNs);
1121       I->setHasNoInfs(FMFs.NoInfs);
1122       I->setHasNoSignedZeros(FMFs.NoSignedZeros);
1123       I->setHasAllowReciprocal(FMFs.AllowReciprocal);
1124       I->setHasAllowContract(FMFs.AllowContract);
1125       I->setHasApproxFunc(FMFs.ApproxFunc);
1126       break;
1127     case OperationType::NonNegOp:
1128       I->setNonNeg(NonNegFlags.NonNeg);
1129       break;
1130     case OperationType::Cmp:
1131     case OperationType::Other:
1132       break;
1133     }
1134   }
1135 
1136   CmpInst::Predicate getPredicate() const {
1137     assert(OpType == OperationType::Cmp &&
1138            "recipe doesn't have a compare predicate");
1139     return CmpPredicate;
1140   }
1141 
1142   GEPNoWrapFlags getGEPNoWrapFlags() const { return GEPFlags; }
1143 
1144   /// Returns true if the recipe has fast-math flags.
1145   bool hasFastMathFlags() const { return OpType == OperationType::FPMathOp; }
1146 
1147   FastMathFlags getFastMathFlags() const;
1148 
1149   bool hasNoUnsignedWrap() const {
1150     assert(OpType == OperationType::OverflowingBinOp &&
1151            "recipe doesn't have a NUW flag");
1152     return WrapFlags.HasNUW;
1153   }
1154 
1155   bool hasNoSignedWrap() const {
1156     assert(OpType == OperationType::OverflowingBinOp &&
1157            "recipe doesn't have a NSW flag");
1158     return WrapFlags.HasNSW;
1159   }
1160 
1161   bool isDisjoint() const {
1162     assert(OpType == OperationType::DisjointOp &&
1163            "recipe doesn't have a disjoint flag");
1164     return DisjointFlags.IsDisjoint;
1165   }
1166 
1167 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1168   void printFlags(raw_ostream &O) const;
1169 #endif
1170 };
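// A worked example: a recipe built from `%a = add nuw nsw i32 %x, %y` records
// OperationType::OverflowingBinOp with HasNUW = HasNSW = true. Calling
// dropPoisonGeneratingFlags() clears both, and a later setFlags(I) re-applies
// whatever flags remain to the generated instruction I.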
1171 
1172 /// Helper to access the operand that contains the unroll part for this recipe
1173 /// after unrolling.
1174 template <unsigned PartOpIdx> class VPUnrollPartAccessor {
1175 protected:
1176   /// Return the VPValue operand containing the unroll part or null if there is
1177   /// no such operand.
1178   VPValue *getUnrollPartOperand(VPUser &U) const;
1179 
1180   /// Return the unroll part.
1181   unsigned getUnrollPart(VPUser &U) const;
1182 };
1183 
1184 /// This is a concrete Recipe that models a single VPlan-level instruction.
1185 /// While, like any Recipe, it may generate a sequence of IR instructions when
1186 /// executed, these instructions would always form a single-def expression as
1187 /// the VPInstruction is also a single def-use vertex.
1188 class VPInstruction : public VPRecipeWithIRFlags,
1189                       public VPUnrollPartAccessor<1> {
1190   friend class VPlanSlp;
1191 
1192 public:
1193   /// VPlan opcodes, extending LLVM IR with idiomatic instructions.
1194   enum {
1195     FirstOrderRecurrenceSplice =
1196         Instruction::OtherOpsEnd + 1, // Combines the incoming and previous
1197                                       // values of a first-order recurrence.
1198     Not,
1199     SLPLoad,
1200     SLPStore,
1201     ActiveLaneMask,
1202     ExplicitVectorLength,
1203     /// Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
1204     /// The first operand is the incoming value from the predecessor in VPlan,
1205     /// the second operand is the incoming value for all other predecessors
1206     /// (which are currently not modeled in VPlan).
1207     ResumePhi,
1208     CalculateTripCountMinusVF,
1209     // Increment the canonical IV separately for each unrolled part.
1210     CanonicalIVIncrementForPart,
1211     BranchOnCount,
1212     BranchOnCond,
1213     ComputeReductionResult,
1214     // Takes the VPValue to extract from as first operand and the lane or part
1215     // to extract as second operand, counting from the end starting with 1 for
1216     // last. The second operand must be a positive constant and <= VF.
1217     ExtractFromEnd,
1218     LogicalAnd, // Non-poison propagating logical And.
1219     // Add an offset in bytes (second operand) to a base pointer (first
1220     // operand). Only generates scalar values (either for the first lane only or
1221     // for all lanes, depending on its uses).
1222     PtrAdd,
1223     // Returns a scalar boolean value, which is true if any lane of its (only
1224     // boolean) vector operand is true.
1225     AnyOf,
1226   };
1227 
1228 private:
1229   typedef unsigned char OpcodeTy;
1230   OpcodeTy Opcode;
1231 
1232   /// An optional name that can be used for the generated IR instruction.
1233   const std::string Name;
1234 
1235   /// Returns true if this VPInstruction generates scalar values for all lanes.
1236   /// Most VPInstructions generate a single value per part, either vector or
1237   /// scalar. VPReplicateRecipe takes care of generating multiple (scalar)
1238   /// values per all lanes, stemming from an original ingredient. This method
1239   /// values for all lanes, stemming from an original ingredient. This method
1240   /// underlying ingredient.
1241   bool doesGeneratePerAllLanes() const;
1242 
1243   /// Returns true if we can generate a scalar for the first lane only if
1244   /// needed.
1245   bool canGenerateScalarForFirstLane() const;
1246 
1247   /// Utility methods serving execute(): generates a single vector instance of
1248   /// the modeled instruction. \returns the generated value. In some cases an
1249   /// existing value is returned rather than a generated one.
1250   Value *generate(VPTransformState &State);
1251 
1252   /// Utility methods serving execute(): generates a scalar single instance of
1253   /// the modeled instruction for a given lane. \returns the scalar generated
1254   /// value for lane \p Lane.
1255   Value *generatePerLane(VPTransformState &State, const VPLane &Lane);
1256 
1257 #if !defined(NDEBUG)
1258   /// Return true if the VPInstruction is a floating point math operation, i.e.
1259   /// has fast-math flags.
1260   bool isFPMathOp() const;
1261 #endif
1262 
1263 public:
1264   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL,
1265                 const Twine &Name = "")
1266       : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL),
1267         Opcode(Opcode), Name(Name.str()) {}
1268 
1269   VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
1270                 DebugLoc DL = {}, const Twine &Name = "")
1271       : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name) {}
1272 
1273   VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, VPValue *A,
1274                 VPValue *B, DebugLoc DL = {}, const Twine &Name = "");
1275 
1276   VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
1277                 WrapFlagsTy WrapFlags, DebugLoc DL = {}, const Twine &Name = "")
1278       : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, WrapFlags, DL),
1279         Opcode(Opcode), Name(Name.str()) {}
1280 
1281   VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
1282                 DisjointFlagsTy DisjointFlag, DebugLoc DL = {},
1283                 const Twine &Name = "")
1284       : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DisjointFlag, DL),
1285         Opcode(Opcode), Name(Name.str()) {
1286     assert(Opcode == Instruction::Or && "only OR opcodes can be disjoint");
1287   }
1288 
1289   VPInstruction(VPValue *Ptr, VPValue *Offset, GEPNoWrapFlags Flags,
1290                 DebugLoc DL = {}, const Twine &Name = "")
1291       : VPRecipeWithIRFlags(VPDef::VPInstructionSC,
1292                             ArrayRef<VPValue *>({Ptr, Offset}), Flags, DL),
1293         Opcode(VPInstruction::PtrAdd), Name(Name.str()) {}
1294 
1295   VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
1296                 FastMathFlags FMFs, DebugLoc DL = {}, const Twine &Name = "");
1297 
1298   VP_CLASSOF_IMPL(VPDef::VPInstructionSC)
1299 
1300   VPInstruction *clone() override {
1301     SmallVector<VPValue *, 2> Operands(operands());
1302     auto *New = new VPInstruction(Opcode, Operands, getDebugLoc(), Name);
1303     New->transferFlags(*this);
1304     return New;
1305   }
1306 
1307   unsigned getOpcode() const { return Opcode; }
1308 
1309   /// Generate the instruction.
1310   /// TODO: We currently execute only per-part unless a specific instance is
1311   /// provided.
1312   void execute(VPTransformState &State) override;
1313 
1314   /// Return the cost of this VPInstruction.
1315   InstructionCost computeCost(ElementCount VF,
1316                               VPCostContext &Ctx) const override {
1317     // TODO: Compute accurate cost after retiring the legacy cost model.
1318     return 0;
1319   }
1320 
1321 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1322   /// Print the VPInstruction to \p O.
1323   void print(raw_ostream &O, const Twine &Indent,
1324              VPSlotTracker &SlotTracker) const override;
1325 
1326   /// Print the VPInstruction to dbgs() (for debugging).
1327   LLVM_DUMP_METHOD void dump() const;
1328 #endif
1329 
1330   bool hasResult() const {
1331     // CallInst may or may not have a result, depending on the called function.
1332     // Conservatively assume calls have results for now.
1333     switch (getOpcode()) {
1334     case Instruction::Ret:
1335     case Instruction::Br:
1336     case Instruction::Store:
1337     case Instruction::Switch:
1338     case Instruction::IndirectBr:
1339     case Instruction::Resume:
1340     case Instruction::CatchRet:
1341     case Instruction::Unreachable:
1342     case Instruction::Fence:
1343     case Instruction::AtomicRMW:
1344     case VPInstruction::BranchOnCond:
1345     case VPInstruction::BranchOnCount:
1346       return false;
1347     default:
1348       return true;
1349     }
1350   }
1351 
1352   /// Returns true if the underlying opcode may read from or write to memory.
1353   bool opcodeMayReadOrWriteFromMemory() const;
1354 
1355   /// Returns true if the recipe only uses the first lane of operand \p Op.
1356   bool onlyFirstLaneUsed(const VPValue *Op) const override;
1357 
1358   /// Returns true if the recipe only uses the first part of operand \p Op.
1359   bool onlyFirstPartUsed(const VPValue *Op) const override;
1360 
1361   /// Returns true if this VPInstruction produces a scalar value from a vector,
1362   /// e.g. by performing a reduction or extracting a lane.
1363   bool isVectorToScalar() const;
1364 
1365   /// Returns true if this VPInstruction's operands are single scalars and the
1366   /// result is also a single scalar.
1367   bool isSingleScalar() const;
1368 
1369   /// Returns the symbolic name assigned to the VPInstruction.
1370   StringRef getName() const { return Name; }
1371 };
1372 
1373 /// A recipe to wrap an original IR instruction that is not modified during
1374 /// execution, except for PHIs. For PHIs, a single VPValue operand is allowed,
1375 /// and it is used to add a new incoming value for the single predecessor VPBB.
1376 /// Except for PHIs, VPIRInstructions cannot have any operands.
1377 class VPIRInstruction : public VPRecipeBase {
1378   Instruction &I;
1379 
1380 public:
1381   VPIRInstruction(Instruction &I)
1382       : VPRecipeBase(VPDef::VPIRInstructionSC, ArrayRef<VPValue *>()), I(I) {}
1383 
1384   ~VPIRInstruction() override = default;
1385 
1386   VP_CLASSOF_IMPL(VPDef::VPIRInstructionSC)
1387 
1388   VPIRInstruction *clone() override {
1389     auto *R = new VPIRInstruction(I);
1390     for (auto *Op : operands())
1391       R->addOperand(Op);
1392     return R;
1393   }
1394 
1395   void execute(VPTransformState &State) override;
1396 
1397   /// Return the cost of this VPIRInstruction.
1398   InstructionCost computeCost(ElementCount VF,
1399                               VPCostContext &Ctx) const override;
1400 
1401   Instruction &getInstruction() const { return I; }
1402 
1403 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1404   /// Print the recipe.
1405   void print(raw_ostream &O, const Twine &Indent,
1406              VPSlotTracker &SlotTracker) const override;
1407 #endif
1408 
1409   bool usesScalars(const VPValue *Op) const override {
1410     assert(is_contained(operands(), Op) &&
1411            "Op must be an operand of the recipe");
1412     return true;
1413   }
1414 
1415   bool onlyFirstPartUsed(const VPValue *Op) const override {
1416     assert(is_contained(operands(), Op) &&
1417            "Op must be an operand of the recipe");
1418     return true;
1419   }
1420 
1421   bool onlyFirstLaneUsed(const VPValue *Op) const override {
1422     assert(is_contained(operands(), Op) &&
1423            "Op must be an operand of the recipe");
1424     return true;
1425   }
1426 
1427   /// Update the recipe's single operand to the last lane of the operand using
1428   /// \p Builder. Must only be used for single-operand VPIRInstructions wrapping
1429   /// a PHINode.
1430   void extractLastLaneOfOperand(VPBuilder &Builder);
1431 };
1432 
1433 /// VPWidenRecipe is a recipe for producing a widened instruction using the
1434 /// opcode and operands of the recipe. This recipe covers most of the
1435 /// traditional vectorization cases where each recipe transforms into a
1436 /// vectorized version of itself.
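/// For example (an illustrative sketch; the VPValues A and B and the scalar
/// add instruction Add are assumed to exist), a widened add can be built as:
/// \code
///   SmallVector<VPValue *, 2> Ops = {A, B};
///   auto *WidenR = new VPWidenRecipe(*Add, make_range(Ops.begin(), Ops.end()));
/// \endcode
/// With VF = 4 and a 32-bit add, executing the recipe emits a single
/// add <4 x i32> covering all four lanes.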
1437 class VPWidenRecipe : public VPRecipeWithIRFlags {
1438   unsigned Opcode;
1439 
1440 protected:
1441   template <typename IterT>
1442   VPWidenRecipe(unsigned VPDefOpcode, Instruction &I,
1443                 iterator_range<IterT> Operands)
1444       : VPRecipeWithIRFlags(VPDefOpcode, Operands, I), Opcode(I.getOpcode()) {}
1445 
1446 public:
1447   template <typename IterT>
1448   VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
1449       : VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {}
1450 
1451   ~VPWidenRecipe() override = default;
1452 
1453   VPWidenRecipe *clone() override {
1454     auto *R = new VPWidenRecipe(*getUnderlyingInstr(), operands());
1455     R->transferFlags(*this);
1456     return R;
1457   }
1458 
1459   static inline bool classof(const VPRecipeBase *R) {
1460     return R->getVPDefID() == VPRecipeBase::VPWidenSC ||
1461            R->getVPDefID() == VPRecipeBase::VPWidenEVLSC;
1462   }
1463 
1464   static inline bool classof(const VPUser *U) {
1465     auto *R = dyn_cast<VPRecipeBase>(U);
1466     return R && classof(R);
1467   }
1468 
1469   /// Produce a widened instruction using the opcode and operands of the recipe,
1470   /// processing State.VF elements.
1471   void execute(VPTransformState &State) override;
1472 
1473   /// Return the cost of this VPWidenRecipe.
1474   InstructionCost computeCost(ElementCount VF,
1475                               VPCostContext &Ctx) const override;
1476 
1477   unsigned getOpcode() const { return Opcode; }
1478 
1479 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1480   /// Print the recipe.
1481   void print(raw_ostream &O, const Twine &Indent,
1482              VPSlotTracker &SlotTracker) const override;
1483 #endif
1484 };
1485 
1486 /// A recipe for widening operations using vector-predication intrinsics with
1487 /// an explicit vector length (EVL).
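/// For example (illustrative only; W and EVL are assumed to exist), an
/// existing VPWidenRecipe W can be rewritten into its EVL form given the
/// explicit-vector-length VPValue EVL:
/// \code
///   auto *EVLRecipe = new VPWidenEVLRecipe(*W, *EVL);
/// \endcode
/// Executing the new recipe emits e.g. llvm.vp.add instead of a plain add,
/// processing only EVL elements.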
1488 class VPWidenEVLRecipe : public VPWidenRecipe {
1489   using VPRecipeWithIRFlags::transferFlags;
1490 
1491 public:
1492   template <typename IterT>
1493   VPWidenEVLRecipe(Instruction &I, iterator_range<IterT> Operands, VPValue &EVL)
1494       : VPWidenRecipe(VPDef::VPWidenEVLSC, I, Operands) {
1495     addOperand(&EVL);
1496   }
1497   VPWidenEVLRecipe(VPWidenRecipe &W, VPValue &EVL)
1498       : VPWidenEVLRecipe(*W.getUnderlyingInstr(), W.operands(), EVL) {
1499     transferFlags(W);
1500   }
1501 
1502   ~VPWidenEVLRecipe() override = default;
1503 
1504   VPWidenRecipe *clone() override final {
1505     llvm_unreachable("VPWidenEVLRecipe cannot be cloned");
1506     return nullptr;
1507   }
1508 
1509   VP_CLASSOF_IMPL(VPDef::VPWidenEVLSC);
1510 
1511   VPValue *getEVL() { return getOperand(getNumOperands() - 1); }
1512   const VPValue *getEVL() const { return getOperand(getNumOperands() - 1); }
1513 
1514   /// Produce a vp-intrinsic using the opcode and operands of the recipe,
1515   /// processing EVL elements.
1516   void execute(VPTransformState &State) override final;
1517 
1518   /// Returns true if the recipe only uses the first lane of operand \p Op.
1519   bool onlyFirstLaneUsed(const VPValue *Op) const override {
1520     assert(is_contained(operands(), Op) &&
1521            "Op must be an operand of the recipe");
1522     // EVL is always the last operand of this recipe, so any use of an earlier
1523     // operand means the VPValue must be vectorized.
1524     return getEVL() == Op;
1525   }
1526 
1527 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1528   /// Print the recipe.
1529   void print(raw_ostream &O, const Twine &Indent,
1530              VPSlotTracker &SlotTracker) const override final;
1531 #endif
1532 };
1533 
1534 /// VPWidenCastRecipe is a recipe to create vector cast instructions.
1535 class VPWidenCastRecipe : public VPRecipeWithIRFlags {
1536   /// Cast instruction opcode.
1537   Instruction::CastOps Opcode;
1538 
1539   /// Result type for the cast.
1540   Type *ResultTy;
1541 
1542 public:
1543   VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
1544                     CastInst &UI)
1545       : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), Opcode(Opcode),
1546         ResultTy(ResultTy) {
1547     assert(UI.getOpcode() == Opcode &&
1548            "opcode of underlying cast doesn't match");
1549   }
1550 
1551   VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
1552       : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), Opcode(Opcode),
1553         ResultTy(ResultTy) {}
1554 
1555   ~VPWidenCastRecipe() override = default;
1556 
1557   VPWidenCastRecipe *clone() override {
1558     if (auto *UV = getUnderlyingValue())
1559       return new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy,
1560                                    *cast<CastInst>(UV));
1561 
1562     return new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy);
1563   }
1564 
1565   VP_CLASSOF_IMPL(VPDef::VPWidenCastSC)
1566 
1567   /// Produce widened copies of the cast.
1568   void execute(VPTransformState &State) override;
1569 
1570   /// Return the cost of this VPWidenCastRecipe.
1571   InstructionCost computeCost(ElementCount VF,
1572                               VPCostContext &Ctx) const override;
1573 
1574 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1575   /// Print the recipe.
1576   void print(raw_ostream &O, const Twine &Indent,
1577              VPSlotTracker &SlotTracker) const override;
1578 #endif
1579 
1580   Instruction::CastOps getOpcode() const { return Opcode; }
1581 
1582   /// Returns the result type of the cast.
1583   Type *getResultType() const { return ResultTy; }
1584 };
1585 
1586 /// VPScalarCastRecipe is a recipe to create scalar cast instructions.
1587 class VPScalarCastRecipe : public VPSingleDefRecipe {
1588   Instruction::CastOps Opcode;
1589 
1590   Type *ResultTy;
1591 
1592   Value *generate(VPTransformState &State);
1593 
1594 public:
1595   VPScalarCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
1596                      DebugLoc DL)
1597       : VPSingleDefRecipe(VPDef::VPScalarCastSC, {Op}, DL), Opcode(Opcode),
1598         ResultTy(ResultTy) {}
1599 
1600   ~VPScalarCastRecipe() override = default;
1601 
1602   VPScalarCastRecipe *clone() override {
1603     return new VPScalarCastRecipe(Opcode, getOperand(0), ResultTy,
1604                                   getDebugLoc());
1605   }
1606 
1607   VP_CLASSOF_IMPL(VPDef::VPScalarCastSC)
1608 
1609   void execute(VPTransformState &State) override;
1610 
1611   /// Return the cost of this VPScalarCastRecipe.
1612   InstructionCost computeCost(ElementCount VF,
1613                               VPCostContext &Ctx) const override {
1614     // TODO: Compute accurate cost after retiring the legacy cost model.
1615     return 0;
1616   }
1617 
1618 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1619   void print(raw_ostream &O, const Twine &Indent,
1620              VPSlotTracker &SlotTracker) const override;
1621 #endif
1622 
1623   /// Returns the result type of the cast.
1624   Type *getResultType() const { return ResultTy; }
1625 
1626   bool onlyFirstLaneUsed(const VPValue *Op) const override {
1627     // At the moment, only uniform codegen is implemented.
1628     assert(is_contained(operands(), Op) &&
1629            "Op must be an operand of the recipe");
1630     return true;
1631   }
1632 };
1633 
1634 /// A recipe for widening vector intrinsics.
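/// For example (an illustrative sketch; X, Y, Int32Ty and DL are assumed to
/// exist), a widened smax intrinsic without an underlying call can be created
/// as:
/// \code
///   new VPWidenIntrinsicRecipe(Intrinsic::smax, {X, Y}, Int32Ty, DL);
/// \endcode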
1635 class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
1636   /// ID of the vector intrinsic to widen.
1637   Intrinsic::ID VectorIntrinsicID;
1638 
1639   /// Scalar return type of the intrinsic.
1640   Type *ResultTy;
1641 
1642   /// True if the intrinsic may read from memory.
1643   bool MayReadFromMemory;
1644 
1645   /// True if the intrinsic may write to memory.
1646   bool MayWriteToMemory;
1647 
1648   /// True if the intrinsic may have side-effects.
1649   bool MayHaveSideEffects;
1650 
1651 public:
1652   VPWidenIntrinsicRecipe(CallInst &CI, Intrinsic::ID VectorIntrinsicID,
1653                          ArrayRef<VPValue *> CallArguments, Type *Ty,
1654                          DebugLoc DL = {})
1655       : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, CI),
1656         VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty),
1657         MayReadFromMemory(CI.mayReadFromMemory()),
1658         MayWriteToMemory(CI.mayWriteToMemory()),
1659         MayHaveSideEffects(CI.mayHaveSideEffects()) {}
1660 
1661   VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
1662                          ArrayRef<VPValue *> CallArguments, Type *Ty,
1663                          DebugLoc DL = {})
1664       : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, DL),
1665         VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) {
1666     LLVMContext &Ctx = Ty->getContext();
1667     AttributeList Attrs = Intrinsic::getAttributes(Ctx, VectorIntrinsicID);
1668     MemoryEffects ME = Attrs.getMemoryEffects();
1669     MayReadFromMemory = !ME.onlyWritesMemory();
1670     MayWriteToMemory = !ME.onlyReadsMemory();
1671     MayHaveSideEffects = MayWriteToMemory ||
1672                          !Attrs.hasFnAttr(Attribute::NoUnwind) ||
1673                          !Attrs.hasFnAttr(Attribute::WillReturn);
1674   }
1675 
1676   VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
1677                          std::initializer_list<VPValue *> CallArguments,
1678                          Type *Ty, DebugLoc DL = {})
1679       : VPWidenIntrinsicRecipe(VectorIntrinsicID,
1680                                ArrayRef<VPValue *>(CallArguments), Ty, DL) {}
1681 
1682   ~VPWidenIntrinsicRecipe() override = default;
1683 
1684   VPWidenIntrinsicRecipe *clone() override {
1685     return new VPWidenIntrinsicRecipe(*cast<CallInst>(getUnderlyingValue()),
1686                                       VectorIntrinsicID, {op_begin(), op_end()},
1687                                       ResultTy, getDebugLoc());
1688   }
1689 
1690   VP_CLASSOF_IMPL(VPDef::VPWidenIntrinsicSC)
1691 
1692   /// Produce a widened version of the vector intrinsic.
1693   void execute(VPTransformState &State) override;
1694 
1695   /// Return the cost of this vector intrinsic.
1696   InstructionCost computeCost(ElementCount VF,
1697                               VPCostContext &Ctx) const override;
1698 
1699   /// Return the ID of the intrinsic.
1700   Intrinsic::ID getVectorIntrinsicID() const { return VectorIntrinsicID; }
1701 
1702   /// Return the scalar return type of the intrinsic.
1703   Type *getResultType() const { return ResultTy; }
1704 
1705   /// Return the name of the intrinsic as a string.
1706   StringRef getIntrinsicName() const;
1707 
1708   /// Returns true if the intrinsic may read from memory.
1709   bool mayReadFromMemory() const { return MayReadFromMemory; }
1710 
1711   /// Returns true if the intrinsic may write to memory.
1712   bool mayWriteToMemory() const { return MayWriteToMemory; }
1713 
1714   /// Returns true if the intrinsic may have side-effects.
1715   bool mayHaveSideEffects() const { return MayHaveSideEffects; }
1716 
1717 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1718   /// Print the recipe.
1719   void print(raw_ostream &O, const Twine &Indent,
1720              VPSlotTracker &SlotTracker) const override;
1721 #endif
1722 
1723   bool onlyFirstLaneUsed(const VPValue *Op) const override;
1724 };
1725 
1726 /// A recipe for widening Call instructions using library calls.
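/// For example (illustrative; CI, VecVariant and Args are assumed to exist),
/// a recipe calling a vectorized variant can be created as:
/// \code
///   new VPWidenCallRecipe(CI, VecVariant, Args, CI->getDebugLoc());
/// \endcode
/// Note that the last element of Args must be the VPValue of the called
/// scalar function; see the assertion in the constructor.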
1727 class VPWidenCallRecipe : public VPRecipeWithIRFlags {
1728   /// Variant stores a pointer to the chosen function. There is a 1:1 mapping
1729   /// between a given VF and the chosen vectorized variant, so there will be a
1730   /// different VPlan for each VF with a valid variant.
1731   Function *Variant;
1732 
1733 public:
1734   VPWidenCallRecipe(Value *UV, Function *Variant,
1735                     ArrayRef<VPValue *> CallArguments, DebugLoc DL = {})
1736       : VPRecipeWithIRFlags(VPDef::VPWidenCallSC, CallArguments,
1737                             *cast<Instruction>(UV)),
1738         Variant(Variant) {
1739     assert(
1740         isa<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()) &&
1741         "last operand must be the called function");
1742   }
1743 
1744   ~VPWidenCallRecipe() override = default;
1745 
1746   VPWidenCallRecipe *clone() override {
1747     return new VPWidenCallRecipe(getUnderlyingValue(), Variant,
1748                                  {op_begin(), op_end()}, getDebugLoc());
1749   }
1750 
1751   VP_CLASSOF_IMPL(VPDef::VPWidenCallSC)
1752 
1753   /// Produce a widened version of the call instruction.
1754   void execute(VPTransformState &State) override;
1755 
1756   /// Return the cost of this VPWidenCallRecipe.
1757   InstructionCost computeCost(ElementCount VF,
1758                               VPCostContext &Ctx) const override;
1759 
1760   Function *getCalledScalarFunction() const {
1761     return cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue());
1762   }
1763 
1764   operand_range arg_operands() {
1765     return make_range(op_begin(), op_begin() + getNumOperands() - 1);
1766   }
1767   const_operand_range arg_operands() const {
1768     return make_range(op_begin(), op_begin() + getNumOperands() - 1);
1769   }
1770 
1771 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1772   /// Print the recipe.
1773   void print(raw_ostream &O, const Twine &Indent,
1774              VPSlotTracker &SlotTracker) const override;
1775 #endif
1776 };
1777 
1778 /// A recipe representing a sequence of load -> update -> store as part of
1779 /// a histogram operation. This means there may be aliasing between vector
1780 /// lanes, which is handled by the llvm.experimental.vector.histogram family
1781 /// of intrinsics. The only update operations currently supported are
1782 /// 'add' and 'sub' where the other term is loop-invariant.
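/// For example, the recipe covers update loops of the following form, where
/// Inc is loop-invariant and several lanes of Indices may alias the same
/// bucket:
/// \code
///   for (unsigned I = 0; I < N; ++I)
///     Buckets[Indices[I]] += Inc;
/// \endcode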
1783 class VPHistogramRecipe : public VPRecipeBase {
1784   /// Opcode of the update operation, currently either add or sub.
1785   unsigned Opcode;
1786 
1787 public:
1788   template <typename IterT>
1789   VPHistogramRecipe(unsigned Opcode, iterator_range<IterT> Operands,
1790                     DebugLoc DL = {})
1791       : VPRecipeBase(VPDef::VPHistogramSC, Operands, DL), Opcode(Opcode) {}
1792 
1793   ~VPHistogramRecipe() override = default;
1794 
1795   VPHistogramRecipe *clone() override {
1796     return new VPHistogramRecipe(Opcode, operands(), getDebugLoc());
1797   }
1798 
1799   VP_CLASSOF_IMPL(VPDef::VPHistogramSC);
1800 
1801   /// Produce a vectorized histogram operation.
1802   void execute(VPTransformState &State) override;
1803 
1804   /// Return the cost of this VPHistogramRecipe.
1805   InstructionCost computeCost(ElementCount VF,
1806                               VPCostContext &Ctx) const override;
1807 
1808   unsigned getOpcode() const { return Opcode; }
1809 
1810   /// Return the mask operand if one was provided, or a null pointer if all
1811   /// lanes should be executed unconditionally.
1812   VPValue *getMask() const {
1813     return getNumOperands() == 3 ? getOperand(2) : nullptr;
1814   }
1815 
1816 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1817   /// Print the recipe.
1818   void print(raw_ostream &O, const Twine &Indent,
1819              VPSlotTracker &SlotTracker) const override;
1820 #endif
1821 };
1822 
1823 /// A recipe for widening select instructions.
1824 struct VPWidenSelectRecipe : public VPRecipeWithIRFlags {
1825   template <typename IterT>
1826   VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands)
1827       : VPRecipeWithIRFlags(VPDef::VPWidenSelectSC, Operands, I) {}
1828 
1829   ~VPWidenSelectRecipe() override = default;
1830 
1831   VPWidenSelectRecipe *clone() override {
1832     return new VPWidenSelectRecipe(*cast<SelectInst>(getUnderlyingInstr()),
1833                                    operands());
1834   }
1835 
1836   VP_CLASSOF_IMPL(VPDef::VPWidenSelectSC)
1837 
1838   /// Produce a widened version of the select instruction.
1839   void execute(VPTransformState &State) override;
1840 
1841   /// Return the cost of this VPWidenSelectRecipe.
1842   InstructionCost computeCost(ElementCount VF,
1843                               VPCostContext &Ctx) const override;
1844 
1845 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1846   /// Print the recipe.
1847   void print(raw_ostream &O, const Twine &Indent,
1848              VPSlotTracker &SlotTracker) const override;
1849 #endif
1850 
1851   VPValue *getCond() const {
1852     return getOperand(0);
1853   }
1854 
1855   bool isInvariantCond() const {
1856     return getCond()->isDefinedOutsideLoopRegions();
1857   }
1858 };
1859 
1860 /// A recipe for handling GEP instructions.
1861 class VPWidenGEPRecipe : public VPRecipeWithIRFlags {
1862   bool isPointerLoopInvariant() const {
1863     return getOperand(0)->isDefinedOutsideLoopRegions();
1864   }
1865 
1866   bool isIndexLoopInvariant(unsigned I) const {
1867     return getOperand(I + 1)->isDefinedOutsideLoopRegions();
1868   }
1869 
1870   bool areAllOperandsInvariant() const {
1871     return all_of(operands(), [](VPValue *Op) {
1872       return Op->isDefinedOutsideLoopRegions();
1873     });
1874   }
1875 
1876 public:
1877   template <typename IterT>
1878   VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands)
1879       : VPRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, *GEP) {}
1880 
1881   ~VPWidenGEPRecipe() override = default;
1882 
1883   VPWidenGEPRecipe *clone() override {
1884     return new VPWidenGEPRecipe(cast<GetElementPtrInst>(getUnderlyingInstr()),
1885                                 operands());
1886   }
1887 
1888   VP_CLASSOF_IMPL(VPDef::VPWidenGEPSC)
1889 
1890   /// Generate the gep nodes.
1891   void execute(VPTransformState &State) override;
1892 
1893   /// Return the cost of this VPWidenGEPRecipe.
1894   InstructionCost computeCost(ElementCount VF,
1895                               VPCostContext &Ctx) const override {
1896     // TODO: Compute accurate cost after retiring the legacy cost model.
1897     return 0;
1898   }
1899 
1900 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1901   /// Print the recipe.
1902   void print(raw_ostream &O, const Twine &Indent,
1903              VPSlotTracker &SlotTracker) const override;
1904 #endif
1905 };
1906 
1907 /// A recipe to compute the pointers for widened memory accesses of IndexTy
1908 /// in reverse order.
1909 class VPReverseVectorPointerRecipe : public VPRecipeWithIRFlags,
1910                                      public VPUnrollPartAccessor<2> {
1911   Type *IndexedTy;
1912 
1913 public:
1914   VPReverseVectorPointerRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy,
1915                                GEPNoWrapFlags GEPFlags, DebugLoc DL)
1916       : VPRecipeWithIRFlags(VPDef::VPReverseVectorPointerSC,
1917                             ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
1918         IndexedTy(IndexedTy) {}
1919 
1920   VP_CLASSOF_IMPL(VPDef::VPReverseVectorPointerSC)
1921 
1922   VPValue *getVFValue() { return getOperand(1); }
1923   const VPValue *getVFValue() const { return getOperand(1); }
1924 
1925   void execute(VPTransformState &State) override;
1926 
1927   bool onlyFirstLaneUsed(const VPValue *Op) const override {
1928     assert(is_contained(operands(), Op) &&
1929            "Op must be an operand of the recipe");
1930     return true;
1931   }
1932 
1933   /// Return the cost of this VPReverseVectorPointerRecipe.
1934   InstructionCost computeCost(ElementCount VF,
1935                               VPCostContext &Ctx) const override {
1936     // TODO: Compute accurate cost after retiring the legacy cost model.
1937     return 0;
1938   }
1939 
1940   /// Returns true if the recipe only uses the first part of operand \p Op.
1941   bool onlyFirstPartUsed(const VPValue *Op) const override {
1942     assert(is_contained(operands(), Op) &&
1943            "Op must be an operand of the recipe");
1944     assert(getNumOperands() <= 2 && "must have at most two operands");
1945     return true;
1946   }
1947 
1948   VPReverseVectorPointerRecipe *clone() override {
1949     return new VPReverseVectorPointerRecipe(getOperand(0), getVFValue(),
1950                                             IndexedTy, getGEPNoWrapFlags(),
1951                                             getDebugLoc());
1952   }
1953 
1954 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1955   /// Print the recipe.
1956   void print(raw_ostream &O, const Twine &Indent,
1957              VPSlotTracker &SlotTracker) const override;
1958 #endif
1959 };
1960 
1961 /// A recipe to compute the pointers for widened memory accesses of IndexTy.
1962 class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
1963                               public VPUnrollPartAccessor<1> {
1964   Type *IndexedTy;
1965 
1966 public:
1967   VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, GEPNoWrapFlags GEPFlags,
1968                         DebugLoc DL)
1969       : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
1970                             GEPFlags, DL),
1971         IndexedTy(IndexedTy) {}
1972 
1973   VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
1974 
1975   void execute(VPTransformState &State) override;
1976 
1977   bool onlyFirstLaneUsed(const VPValue *Op) const override {
1978     assert(is_contained(operands(), Op) &&
1979            "Op must be an operand of the recipe");
1980     return true;
1981   }
1982 
1983   /// Returns true if the recipe only uses the first part of operand \p Op.
1984   bool onlyFirstPartUsed(const VPValue *Op) const override {
1985     assert(is_contained(operands(), Op) &&
1986            "Op must be an operand of the recipe");
1987     assert(getNumOperands() <= 2 && "must have at most two operands");
1988     return true;
1989   }
1990 
1991   VPVectorPointerRecipe *clone() override {
1992     return new VPVectorPointerRecipe(getOperand(0), IndexedTy,
1993                                      getGEPNoWrapFlags(), getDebugLoc());
1994   }
1995 
1996   /// Return the cost of this VPVectorPointerRecipe.
1997   InstructionCost computeCost(ElementCount VF,
1998                               VPCostContext &Ctx) const override {
1999     // TODO: Compute accurate cost after retiring the legacy cost model.
2000     return 0;
2001   }
2002 
2003 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2004   /// Print the recipe.
2005   void print(raw_ostream &O, const Twine &Indent,
2006              VPSlotTracker &SlotTracker) const override;
2007 #endif
2008 };
2009 
2010 /// A pure virtual base class for all recipes modeling header phis, including
2011 /// phis for first-order recurrences, pointer inductions and reductions. The
2012 /// start value is the first operand of the recipe and the incoming value from
2013 /// the backedge is the second operand.
2014 ///
2015 /// Inductions are modeled using the following sub-classes:
2016 ///  * VPCanonicalIVPHIRecipe: Canonical scalar induction of the vector loop,
2017 ///    starting at a specified value (zero for the main vector loop, the resume
2018 ///    value for the epilogue vector loop) and stepping by 1. The induction
2019 ///    controls exiting of the vector loop by comparing against the vector trip
2020 ///    count. Produces a single scalar PHI for the induction value per
2021 ///    iteration.
2022 ///  * VPWidenIntOrFpInductionRecipe: Generates vector values for integer and
2023 ///    floating point inductions with arbitrary start and step values. Produces
2024 ///    a vector PHI per-part.
2025 ///  * VPDerivedIVRecipe: Converts the canonical IV value to the corresponding
2026 ///    value of an IV with different start and step values. Produces a single
2027 ///    scalar value per iteration.
2028 ///  * VPScalarIVStepsRecipe: Generates scalar values per-lane based on a
2029 ///    canonical or derived induction.
2030 ///  * VPWidenPointerInductionRecipe: Generate vector and scalar values for a
2031 ///    pointer induction. Produces either a vector PHI per-part or scalar values
2032 ///    per-lane based on the canonical induction.
2033 class VPHeaderPHIRecipe : public VPSingleDefRecipe {
2034 protected:
2035   VPHeaderPHIRecipe(unsigned char VPDefID, Instruction *UnderlyingInstr,
2036                     VPValue *Start = nullptr, DebugLoc DL = {})
2037       : VPSingleDefRecipe(VPDefID, ArrayRef<VPValue *>(), UnderlyingInstr, DL) {
2038     if (Start)
2039       addOperand(Start);
2040   }
2041 
2042 public:
2043   ~VPHeaderPHIRecipe() override = default;
2044 
2045   /// Method to support type inquiry through isa, cast, and dyn_cast.
2046   static inline bool classof(const VPRecipeBase *B) {
2047     return B->getVPDefID() >= VPDef::VPFirstHeaderPHISC &&
2048            B->getVPDefID() <= VPDef::VPLastHeaderPHISC;
2049   }
2050   static inline bool classof(const VPValue *V) {
2051     auto *B = V->getDefiningRecipe();
2052     return B && B->getVPDefID() >= VPRecipeBase::VPFirstHeaderPHISC &&
2053            B->getVPDefID() <= VPRecipeBase::VPLastHeaderPHISC;
2054   }
2055 
2056   /// Generate the phi nodes.
2057   void execute(VPTransformState &State) override = 0;
2058 
2059   /// Return the cost of this header phi recipe.
2060   InstructionCost computeCost(ElementCount VF,
2061                               VPCostContext &Ctx) const override;
2062 
2063 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2064   /// Print the recipe.
2065   void print(raw_ostream &O, const Twine &Indent,
2066              VPSlotTracker &SlotTracker) const override = 0;
2067 #endif
2068 
2069   /// Returns the start value of the phi, if one is set.
2070   VPValue *getStartValue() {
2071     return getNumOperands() == 0 ? nullptr : getOperand(0);
2072   }
2073   VPValue *getStartValue() const {
2074     return getNumOperands() == 0 ? nullptr : getOperand(0);
2075   }
2076 
2077   /// Update the start value of the recipe.
2078   void setStartValue(VPValue *V) { setOperand(0, V); }
2079 
2080   /// Returns the incoming value from the loop backedge.
2081   virtual VPValue *getBackedgeValue() {
2082     return getOperand(1);
2083   }
2084 
2085   /// Returns the backedge value as a recipe. The backedge value is guaranteed
2086   /// to be a recipe.
2087   virtual VPRecipeBase &getBackedgeRecipe() {
2088     return *getBackedgeValue()->getDefiningRecipe();
2089   }
2090 };
2091 
2092 /// Base class for widened induction (VPWidenIntOrFpInductionRecipe and
2093 /// VPWidenPointerInductionRecipe), providing shared functionality, including
2094 /// retrieving the step value, induction descriptor and original phi node.
2095 class VPWidenInductionRecipe : public VPHeaderPHIRecipe {
2096   const InductionDescriptor &IndDesc;
2097 
2098 public:
2099   VPWidenInductionRecipe(unsigned char Kind, PHINode *IV, VPValue *Start,
2100                          VPValue *Step, const InductionDescriptor &IndDesc,
2101                          DebugLoc DL)
2102       : VPHeaderPHIRecipe(Kind, IV, Start, DL), IndDesc(IndDesc) {
2103     addOperand(Step);
2104   }
2105 
2106   static inline bool classof(const VPRecipeBase *R) {
2107     return R->getVPDefID() == VPDef::VPWidenIntOrFpInductionSC ||
2108            R->getVPDefID() == VPDef::VPWidenPointerInductionSC;
2109   }
2110 
2111   static inline bool classof(const VPValue *V) {
2112     auto *R = V->getDefiningRecipe();
2113     return R && classof(R);
2114   }
2115 
2116   static inline bool classof(const VPHeaderPHIRecipe *R) {
2117     return classof(static_cast<const VPRecipeBase *>(R));
2118   }
2119 
2120   virtual void execute(VPTransformState &State) override = 0;
2121 
2122   /// Returns the step value of the induction.
2123   VPValue *getStepValue() { return getOperand(1); }
2124   const VPValue *getStepValue() const { return getOperand(1); }
2125 
2126   PHINode *getPHINode() const { return cast<PHINode>(getUnderlyingValue()); }
2127 
2128   /// Returns the induction descriptor for the recipe.
2129   const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
2130 
2131   VPValue *getBackedgeValue() override {
2132     // TODO: All operands of base recipe must exist and be at same index in
2133     // derived recipe.
2134     llvm_unreachable(
2135         "VPWidenIntOrFpInductionRecipe generates its own backedge value");
2136   }
2137 
2138   VPRecipeBase &getBackedgeRecipe() override {
2139     // TODO: All operands of base recipe must exist and be at same index in
2140     // derived recipe.
2141     llvm_unreachable(
2142         "VPWidenIntOrFpInductionRecipe generates its own backedge value");
2143   }
2144 };
2145 
2146 /// A recipe for handling phi nodes of integer and floating-point inductions,
2147 /// producing their vector values.
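/// For example (illustrative), for an integer induction with start S and step
/// T, at VF = 4 the first vector value is
/// \code
///   <S, S + T, S + 2 * T, S + 3 * T> // broadcast of S plus <0,1,2,3> * T
/// \endcode
/// and each subsequent unrolled part adds the splat of 4 * T.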
2148 class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
2149   TruncInst *Trunc;
2150 
2151 public:
2152   VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
2153                                 VPValue *VF, const InductionDescriptor &IndDesc,
2154                                 DebugLoc DL)
2155       : VPWidenInductionRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start,
2156                                Step, IndDesc, DL),
2157         Trunc(nullptr) {
2158     addOperand(VF);
2159   }
2160 
2161   VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
2162                                 VPValue *VF, const InductionDescriptor &IndDesc,
2163                                 TruncInst *Trunc, DebugLoc DL)
2164       : VPWidenInductionRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start,
2165                                Step, IndDesc, DL),
2166         Trunc(Trunc) {
2167     addOperand(VF);
2168   }
2169 
2170   ~VPWidenIntOrFpInductionRecipe() override = default;
2171 
2172   VPWidenIntOrFpInductionRecipe *clone() override {
2173     return new VPWidenIntOrFpInductionRecipe(
2174         getPHINode(), getStartValue(), getStepValue(), getVFValue(),
2175         getInductionDescriptor(), Trunc, getDebugLoc());
2176   }
2177 
2178   VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC)
2179 
2180   /// Generate the vectorized and scalarized versions of the phi node as
2181   /// needed by their users.
2182   void execute(VPTransformState &State) override;
2183 
2184 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2185   /// Print the recipe.
2186   void print(raw_ostream &O, const Twine &Indent,
2187              VPSlotTracker &SlotTracker) const override;
2188 #endif
2189 
2190   VPValue *getVFValue() { return getOperand(2); }
2191   const VPValue *getVFValue() const { return getOperand(2); }
2192 
2193   VPValue *getSplatVFValue() {
2194     // If the recipe has been unrolled (5 operands), return the VPValue for the
2195     // splatted VF, which is used for the induction increment.
2196     return getNumOperands() == 5 ? getOperand(3) : nullptr;
2197   }
2198 
2199   /// Returns the first defined value as a TruncInst, if it is one, or nullptr
2200   /// otherwise.
2201   TruncInst *getTruncInst() { return Trunc; }
2202   const TruncInst *getTruncInst() const { return Trunc; }
2203 
2204   /// Returns true if the induction is canonical, i.e. it starts at 0, is
2205   /// incremented by UF * VF (= the original IV is incremented by 1), and has
2206   /// the same type as the canonical induction.
2207   bool isCanonical() const;
2208 
2209   /// Returns the scalar type of the induction.
2210   Type *getScalarType() const {
2211     return Trunc ? Trunc->getType() : getPHINode()->getType();
2212   }
2213 
2214   /// Returns the VPValue representing the value of this induction at
2215   /// the last unrolled part, if it exists. Returns itself if unrolling did not
2216   /// take place.
2217   VPValue *getLastUnrolledPartOperand() {
2218     return getNumOperands() == 5 ? getOperand(4) : this;
2219   }
2220 };
2221 
2222 class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe,
2223                                       public VPUnrollPartAccessor<3> {
2224   bool IsScalarAfterVectorization;
2225 
2226 public:
2227   /// Create a new VPWidenPointerInductionRecipe for \p Phi with start value \p
2228   /// Start.
2229   VPWidenPointerInductionRecipe(PHINode *Phi, VPValue *Start, VPValue *Step,
2230                                 const InductionDescriptor &IndDesc,
2231                                 bool IsScalarAfterVectorization, DebugLoc DL)
2232       : VPWidenInductionRecipe(VPDef::VPWidenPointerInductionSC, Phi, Start,
2233                                Step, IndDesc, DL),
2234         IsScalarAfterVectorization(IsScalarAfterVectorization) {}
2235 
2236   ~VPWidenPointerInductionRecipe() override = default;
2237 
2238   VPWidenPointerInductionRecipe *clone() override {
2239     return new VPWidenPointerInductionRecipe(
2240         cast<PHINode>(getUnderlyingInstr()), getOperand(0), getOperand(1),
2241         getInductionDescriptor(), IsScalarAfterVectorization, getDebugLoc());
2242   }
2243 
2244   VP_CLASSOF_IMPL(VPDef::VPWidenPointerInductionSC)
2245 
2246   /// Generate vector values for the pointer induction.
2247   void execute(VPTransformState &State) override;
2248 
2249   /// Returns true if only scalar values will be generated.
2250   bool onlyScalarsGenerated(bool IsScalable);
2251 
2252   /// Returns the VPValue representing the value of this induction at
2253   /// the first unrolled part, if it exists. Returns itself if unrolling did not
2254   /// take place.
2255   VPValue *getFirstUnrolledPartOperand() {
2256     return getUnrollPart(*this) == 0 ? this : getOperand(2);
2257   }
2258 
2259 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2260   /// Print the recipe.
2261   void print(raw_ostream &O, const Twine &Indent,
2262              VPSlotTracker &SlotTracker) const override;
2263 #endif
2264 };
2265 
2266 /// Recipe to generate a scalar PHI. Used to generate code for recipes that
2267 /// produce scalar header phis, including VPCanonicalIVPHIRecipe and
2268 /// VPEVLBasedIVPHIRecipe.
2269 class VPScalarPHIRecipe : public VPHeaderPHIRecipe {
2270   std::string Name;
2271 
2272 public:
2273   VPScalarPHIRecipe(VPValue *Start, VPValue *BackedgeValue, DebugLoc DL,
2274                     StringRef Name)
2275       : VPHeaderPHIRecipe(VPDef::VPScalarPHISC, nullptr, Start, DL),
2276         Name(Name.str()) {
2277     addOperand(BackedgeValue);
2278   }
2279 
2280   ~VPScalarPHIRecipe() override = default;
2281 
2282   VPScalarPHIRecipe *clone() override {
2283     llvm_unreachable("cloning not implemented yet");
2284   }
2285 
2286   VP_CLASSOF_IMPL(VPDef::VPScalarPHISC)
2287 
2288   /// Generate the phi/select nodes.
2289   void execute(VPTransformState &State) override;
2290 
2291   /// Returns true if the recipe only uses the first lane of operand \p Op.
2292   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2293     assert(is_contained(operands(), Op) &&
2294            "Op must be an operand of the recipe");
2295     return true;
2296   }
2297 
2298 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2299   /// Print the recipe.
2300   void print(raw_ostream &O, const Twine &Indent,
2301              VPSlotTracker &SlotTracker) const override;
2302 #endif
2303 };
2304 
2305 /// A recipe for handling phis that are widened in the vector loop.
2306 /// In the VPlan native path, all incoming VPValues & VPBasicBlock pairs are
2307 /// managed in the recipe directly.
2308 class VPWidenPHIRecipe : public VPSingleDefRecipe {
2309   /// List of incoming blocks. Only used in the VPlan native path.
2310   SmallVector<VPBasicBlock *, 2> IncomingBlocks;
2311 
2312 public:
2313   /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start and
2314   /// debug location \p DL.
2315   VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr, DebugLoc DL = {})
2316       : VPSingleDefRecipe(VPDef::VPWidenPHISC, ArrayRef<VPValue *>(), Phi, DL) {
2317     if (Start)
2318       addOperand(Start);
2319   }
2320 
2321   VPWidenPHIRecipe *clone() override {
2322     llvm_unreachable("cloning not implemented yet");
2323   }
2324 
2325   ~VPWidenPHIRecipe() override = default;
2326 
2327   VP_CLASSOF_IMPL(VPDef::VPWidenPHISC)
2328 
2329   /// Generate the phi/select nodes.
2330   void execute(VPTransformState &State) override;
2331 
2332 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2333   /// Print the recipe.
2334   void print(raw_ostream &O, const Twine &Indent,
2335              VPSlotTracker &SlotTracker) const override;
2336 #endif
2337 
2338   /// Adds a pair (\p IncomingV, \p IncomingBlock) to the phi.
2339   void addIncoming(VPValue *IncomingV, VPBasicBlock *IncomingBlock) {
2340     addOperand(IncomingV);
2341     IncomingBlocks.push_back(IncomingBlock);
2342   }
2343 
2344   /// Returns the \p I th incoming VPBasicBlock.
2345   VPBasicBlock *getIncomingBlock(unsigned I) { return IncomingBlocks[I]; }
2346 
2347   /// Returns the \p I th incoming VPValue.
2348   VPValue *getIncomingValue(unsigned I) { return getOperand(I); }
2349 };
2350 
2351 /// A recipe for handling first-order recurrence phis. The start value is the
2352 /// first operand of the recipe and the incoming value from the backedge is the
2353 /// second operand.
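/// For example, in the scalar loop
/// \code
///   for (int I = 1; I < N; ++I)
///     B[I] = A[I] + A[I - 1];
/// \endcode
/// the value A[I - 1] was computed (as A[I]) in the previous iteration; the
/// recurrence phi carries it across iterations of the vectorized loop.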
2354 struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe {
2355   VPFirstOrderRecurrencePHIRecipe(PHINode *Phi, VPValue &Start)
2356       : VPHeaderPHIRecipe(VPDef::VPFirstOrderRecurrencePHISC, Phi, &Start) {}
2357 
2358   VP_CLASSOF_IMPL(VPDef::VPFirstOrderRecurrencePHISC)
2359 
2360   static inline bool classof(const VPHeaderPHIRecipe *R) {
2361     return R->getVPDefID() == VPDef::VPFirstOrderRecurrencePHISC;
2362   }
2363 
2364   VPFirstOrderRecurrencePHIRecipe *clone() override {
2365     return new VPFirstOrderRecurrencePHIRecipe(
2366         cast<PHINode>(getUnderlyingInstr()), *getOperand(0));
2367   }
2368 
2369   void execute(VPTransformState &State) override;
2370 
2371   /// Return the cost of this first-order recurrence phi recipe.
2372   InstructionCost computeCost(ElementCount VF,
2373                               VPCostContext &Ctx) const override;
2374 
2375 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2376   /// Print the recipe.
2377   void print(raw_ostream &O, const Twine &Indent,
2378              VPSlotTracker &SlotTracker) const override;
2379 #endif
2380 };
2381 
2382 /// A recipe for handling reduction phis. The start value is the first operand
2383 /// of the recipe and the incoming value from the backedge is the second
2384 /// operand.
2385 class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
2386                              public VPUnrollPartAccessor<2> {
2387   /// Descriptor for the reduction.
2388   const RecurrenceDescriptor &RdxDesc;
2389 
2390   /// The phi is part of an in-loop reduction.
2391   bool IsInLoop;
2392 
2393   /// The phi is part of an ordered reduction. Requires IsInLoop to be true.
2394   bool IsOrdered;
2395 
2396   /// When expanding the reduction PHI, the plan's VF element count is divided
2397   /// by this factor to form the reduction phi's VF.
2398   unsigned VFScaleFactor = 1;
2399 
2400 public:
2401   /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
2402   /// RdxDesc.
2403   VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
2404                        VPValue &Start, bool IsInLoop = false,
2405                        bool IsOrdered = false, unsigned VFScaleFactor = 1)
2406       : VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start),
2407         RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered),
2408         VFScaleFactor(VFScaleFactor) {
2409     assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
2410   }
2411 
2412   ~VPReductionPHIRecipe() override = default;
2413 
2414   VPReductionPHIRecipe *clone() override {
2415     auto *R = new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()),
2416                                        RdxDesc, *getOperand(0), IsInLoop,
2417                                        IsOrdered, VFScaleFactor);
2418     R->addOperand(getBackedgeValue());
2419     return R;
2420   }
2421 
2422   VP_CLASSOF_IMPL(VPDef::VPReductionPHISC)
2423 
2424   static inline bool classof(const VPHeaderPHIRecipe *R) {
2425     return R->getVPDefID() == VPDef::VPReductionPHISC;
2426   }
2427 
2428   /// Generate the phi/select nodes.
2429   void execute(VPTransformState &State) override;
2430 
2431 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2432   /// Print the recipe.
2433   void print(raw_ostream &O, const Twine &Indent,
2434              VPSlotTracker &SlotTracker) const override;
2435 #endif
2436 
2437   const RecurrenceDescriptor &getRecurrenceDescriptor() const {
2438     return RdxDesc;
2439   }
2440 
2441   /// Returns true, if the phi is part of an ordered reduction.
2442   bool isOrdered() const { return IsOrdered; }
2443 
2444   /// Returns true, if the phi is part of an in-loop reduction.
2445   bool isInLoop() const { return IsInLoop; }
2446 };
2447 
2448 /// A recipe for forming partial reductions. In the loop, an accumulator and
2449 /// vector operand are added together and passed to the next iteration as the
2450 /// next accumulator. After the loop body, the accumulator is reduced to a
2451 /// scalar value.
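/// For example (illustrative), in a dot product over narrow elements such as
/// \code
///   int32_t Sum = 0;
///   for (unsigned I = 0; I < N; ++I)
///     Sum += int32_t(A[I]) * int32_t(B[I]); // A and B are int8_t arrays
/// \endcode
/// the accumulator vector may use fewer lanes than the multiply, and is only
/// reduced to a scalar after the loop.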
2452 class VPPartialReductionRecipe : public VPSingleDefRecipe {
2453   unsigned Opcode;
2454 
2455 public:
2456   VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
2457                            VPValue *Op1)
2458       : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1,
2459                                  ReductionInst) {}
2460   VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
2461                            Instruction *ReductionInst = nullptr)
2462       : VPSingleDefRecipe(VPDef::VPPartialReductionSC,
2463                           ArrayRef<VPValue *>({Op0, Op1}), ReductionInst),
2464         Opcode(Opcode) {
2465     [[maybe_unused]] auto *AccumulatorRecipe =
2466         getOperand(1)->getDefiningRecipe();
2467     assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
2468             isa<VPPartialReductionRecipe>(AccumulatorRecipe)) &&
2469            "Unexpected operand order for partial reduction recipe");
2470   }
2471   ~VPPartialReductionRecipe() override = default;
2472 
2473   VPPartialReductionRecipe *clone() override {
2474     return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
2475                                         getUnderlyingInstr());
2476   }
2477 
2478   VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
2479 
2480   /// Generate the reduction in the loop.
2481   void execute(VPTransformState &State) override;
2482 
2483   /// Return the cost of this VPPartialReductionRecipe.
2484   InstructionCost computeCost(ElementCount VF,
2485                               VPCostContext &Ctx) const override;
2486 
2487   /// Get the binary op's opcode.
2488   unsigned getOpcode() const { return Opcode; }
2489 
2490 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2491   /// Print the recipe.
2492   void print(raw_ostream &O, const Twine &Indent,
2493              VPSlotTracker &SlotTracker) const override;
2494 #endif
2495 };
2496 
2497 /// A recipe for vectorizing a phi-node as a sequence of mask-based select
2498 /// instructions.
2499 class VPBlendRecipe : public VPSingleDefRecipe {
2500 public:
2501   /// The blend operation is a User of the incoming values and of their
2502   /// respective masks, ordered [I0, M0, I1, M1, I2, M2, ...]. Note that M0 can
2503   /// be omitted (implied by passing an odd number of operands) in which case
2504   /// all other incoming values are merged into it.
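  /// For example (a conceptual sketch; Builder is an IRBuilder), a normalized
  /// blend with operands [I0, I1, M1, I2, M2] lowers to a chain of selects:
  /// \code
  ///   Value *Res = I0;
  ///   Res = Builder.CreateSelect(M1, I1, Res);
  ///   Res = Builder.CreateSelect(M2, I2, Res);
  /// \endcode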
2505   VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands)
2506       : VPSingleDefRecipe(VPDef::VPBlendSC, Operands, Phi, Phi->getDebugLoc()) {
2507     assert(Operands.size() > 0 && "Expected at least one operand!");
2508   }
2509 
2510   VPBlendRecipe *clone() override {
2511     SmallVector<VPValue *> Ops(operands());
2512     return new VPBlendRecipe(cast<PHINode>(getUnderlyingValue()), Ops);
2513   }
2514 
2515   VP_CLASSOF_IMPL(VPDef::VPBlendSC)
2516 
2517   /// A normalized blend is one that has an odd number of operands, whereby the
2518   /// first operand does not have an associated mask.
2519   bool isNormalized() const { return getNumOperands() % 2; }
2520 
2521   /// Return the number of incoming values, taking into account when normalized
2522   /// the first incoming value will have no mask.
2523   unsigned getNumIncomingValues() const {
2524     return (getNumOperands() + isNormalized()) / 2;
2525   }
2526 
2527   /// Return incoming value number \p Idx.
2528   VPValue *getIncomingValue(unsigned Idx) const {
2529     return Idx == 0 ? getOperand(0) : getOperand(Idx * 2 - isNormalized());
2530   }
2531 
2532   /// Return mask number \p Idx.
2533   VPValue *getMask(unsigned Idx) const {
2534     assert((Idx > 0 || !isNormalized()) && "First index has no mask!");
2535     return Idx == 0 ? getOperand(1) : getOperand(Idx * 2 + !isNormalized());
2536   }
2537 
2538   /// Generate the phi/select nodes.
2539   void execute(VPTransformState &State) override;
2540 
2541   /// Return the cost of this VPBlendRecipe.
2542   InstructionCost computeCost(ElementCount VF,
2543                               VPCostContext &Ctx) const override;
2544 
2545 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2546   /// Print the recipe.
2547   void print(raw_ostream &O, const Twine &Indent,
2548              VPSlotTracker &SlotTracker) const override;
2549 #endif
2550 
2551   /// Returns true if the recipe only uses the first lane of operand \p Op.
2552   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2553     assert(is_contained(operands(), Op) &&
2554            "Op must be an operand of the recipe");
2555     // Recursion happens through Blend recipes only and must terminate at
2556     // header phis at the latest.
2557     return all_of(users(),
2558                   [this](VPUser *U) { return U->onlyFirstLaneUsed(this); });
2559   }
2560 };
2561 
2562 /// VPInterleaveRecipe is a recipe for transforming an interleave group of loads
2563 /// or stores into one wide load/store and shuffles. The first operand of a
2564 /// VPInterleave recipe is the address, followed by the stored values, followed
2565 /// by an optional mask.
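/// For example (illustrative), a factor-2 load group
/// \code
///   for (unsigned I = 0; I < N; ++I)
///     Sum += A[2 * I] + A[2 * I + 1]; // members 0 and 1 of the group
/// \endcode
/// becomes a single wide load of 2 * VF consecutive elements followed by one
/// strided shuffle per member.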
2566 class VPInterleaveRecipe : public VPRecipeBase {
2567   const InterleaveGroup<Instruction> *IG;
2568 
2569   /// Indicates if the interleave group is in a conditional block and requires a
2570   /// mask.
2571   bool HasMask = false;
2572 
2573   /// Indicates if gaps between members of the group need to be masked out or if
2574   /// unused gaps can be loaded speculatively.
2575   bool NeedsMaskForGaps = false;
2576 
2577 public:
2578   VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
2579                      ArrayRef<VPValue *> StoredValues, VPValue *Mask,
2580                      bool NeedsMaskForGaps)
2581       : VPRecipeBase(VPDef::VPInterleaveSC, {Addr}), IG(IG),
2582         NeedsMaskForGaps(NeedsMaskForGaps) {
2583     for (unsigned i = 0; i < IG->getFactor(); ++i)
2584       if (Instruction *I = IG->getMember(i)) {
2585         if (I->getType()->isVoidTy())
2586           continue;
2587         new VPValue(I, this);
2588       }
2589 
2590     for (auto *SV : StoredValues)
2591       addOperand(SV);
2592     if (Mask) {
2593       HasMask = true;
2594       addOperand(Mask);
2595     }
2596   }
2597   ~VPInterleaveRecipe() override = default;
2598 
2599   VPInterleaveRecipe *clone() override {
2600     return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(),
2601                                   NeedsMaskForGaps);
2602   }
2603 
2604   VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
2605 
2606   /// Return the address accessed by this recipe.
2607   VPValue *getAddr() const {
2608     return getOperand(0); // Address is the 1st, mandatory operand.
2609   }
2610 
2611   /// Return the mask used by this recipe. Note that a full mask is represented
2612   /// by a nullptr.
2613   VPValue *getMask() const {
2614     // Mask is optional and therefore the last operand.
2615     return HasMask ? getOperand(getNumOperands() - 1) : nullptr;
2616   }
2617 
2618   /// Return the VPValues stored by this interleave group. If it is a load
2619   /// interleave group, return an empty ArrayRef.
2620   ArrayRef<VPValue *> getStoredValues() const {
2621     // The first operand is the address, followed by the stored values, followed
2622     // by an optional mask.
2623     return ArrayRef<VPValue *>(op_begin(), getNumOperands())
2624         .slice(1, getNumStoreOperands());
2625   }
2626 
2627   /// Generate the wide load or store, and shuffles.
2628   void execute(VPTransformState &State) override;
2629 
2630   /// Return the cost of this VPInterleaveRecipe.
2631   InstructionCost computeCost(ElementCount VF,
2632                               VPCostContext &Ctx) const override;
2633 
2634 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2635   /// Print the recipe.
2636   void print(raw_ostream &O, const Twine &Indent,
2637              VPSlotTracker &SlotTracker) const override;
2638 #endif
2639 
2640   const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
2641 
2642   /// Returns the number of stored operands of this interleave group. Returns 0
2643   /// for load interleave groups.
2644   unsigned getNumStoreOperands() const {
2645     return getNumOperands() - (HasMask ? 2 : 1);
2646   }
2647 
2648   /// The recipe only uses the first lane of the address.
2649   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2650     assert(is_contained(operands(), Op) &&
2651            "Op must be an operand of the recipe");
2652     return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
2653   }
2654 
2655   Instruction *getInsertPos() const { return IG->getInsertPos(); }
2656 };
2657 
2658 /// A recipe to represent in-loop reduction operations, performing a reduction
2659 /// a vector operand into a scalar value, and adding the result to a chain.
2660 /// The Operands are {ChainOp, VecOp, [Condition]}.
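/// For example (a conceptual sketch; Builder is an IRBuilder), an unordered
/// in-loop add reduction performs, per vector iteration:
/// \code
///   Value *Red = Builder.CreateAddReduce(Vec);       // vector -> scalar
///   Value *NewChain = Builder.CreateAdd(Chain, Red); // accumulate into chain
/// \endcode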
2661 class VPReductionRecipe : public VPSingleDefRecipe {
2662   /// The recurrence descriptor for the reduction in question.
2663   const RecurrenceDescriptor &RdxDesc;
2664   bool IsOrdered;
2665   /// Whether the reduction is conditional.
2666   bool IsConditional = false;
2667 
2668 protected:
2669   VPReductionRecipe(const unsigned char SC, const RecurrenceDescriptor &R,
2670                     Instruction *I, ArrayRef<VPValue *> Operands,
2671                     VPValue *CondOp, bool IsOrdered, DebugLoc DL)
2672       : VPSingleDefRecipe(SC, Operands, I, DL), RdxDesc(R),
2673         IsOrdered(IsOrdered) {
2674     if (CondOp) {
2675       IsConditional = true;
2676       addOperand(CondOp);
2677     }
2678   }
2679 
2680 public:
2681   VPReductionRecipe(const RecurrenceDescriptor &R, Instruction *I,
2682                     VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
2683                     bool IsOrdered, DebugLoc DL = {})
2684       : VPReductionRecipe(VPDef::VPReductionSC, R, I,
2685                           ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
2686                           IsOrdered, DL) {}
2687 
2688   ~VPReductionRecipe() override = default;
2689 
2690   VPReductionRecipe *clone() override {
2691     return new VPReductionRecipe(RdxDesc, getUnderlyingInstr(), getChainOp(),
2692                                  getVecOp(), getCondOp(), IsOrdered,
2693                                  getDebugLoc());
2694   }
2695 
2696   static inline bool classof(const VPRecipeBase *R) {
2697     return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
2698            R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
2699   }
2700 
2701   static inline bool classof(const VPUser *U) {
2702     auto *R = dyn_cast<VPRecipeBase>(U);
2703     return R && classof(R);
2704   }
2705 
2706   /// Generate the reduction in the loop.
2707   void execute(VPTransformState &State) override;
2708 
2709   /// Return the cost of VPReductionRecipe.
2710   InstructionCost computeCost(ElementCount VF,
2711                               VPCostContext &Ctx) const override;
2712 
2713 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2714   /// Print the recipe.
2715   void print(raw_ostream &O, const Twine &Indent,
2716              VPSlotTracker &SlotTracker) const override;
2717 #endif
2718 
2719   /// Return the recurrence descriptor for the in-loop reduction.
2720   const RecurrenceDescriptor &getRecurrenceDescriptor() const {
2721     return RdxDesc;
2722   }
2723   /// Return true if the in-loop reduction is ordered.
2724   bool isOrdered() const { return IsOrdered; }
2725   /// Return true if the in-loop reduction is conditional.
2726   bool isConditional() const { return IsConditional; }
2727   /// The VPValue of the scalar Chain being accumulated.
2728   VPValue *getChainOp() const { return getOperand(0); }
2729   /// The VPValue of the vector value to be reduced.
2730   VPValue *getVecOp() const { return getOperand(1); }
2731   /// The VPValue of the condition for the block.
2732   VPValue *getCondOp() const {
2733     return isConditional() ? getOperand(getNumOperands() - 1) : nullptr;
2734   }
2735 };
2736 
2737 /// A recipe to represent in-loop reduction operations with vector-predication
2738 /// intrinsics, performing a reduction on a vector operand with the explicit
2739 /// vector length (EVL) into a scalar value, and adding the result to a chain.
2740 /// The Operands are {ChainOp, VecOp, EVL, [Condition]}.
2741 class VPReductionEVLRecipe : public VPReductionRecipe {
2742 public:
2743   VPReductionEVLRecipe(VPReductionRecipe &R, VPValue &EVL, VPValue *CondOp)
2744       : VPReductionRecipe(
2745             VPDef::VPReductionEVLSC, R.getRecurrenceDescriptor(),
2746             cast_or_null<Instruction>(R.getUnderlyingValue()),
2747             ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp,
2748             R.isOrdered(), R.getDebugLoc()) {}
2749 
2750   ~VPReductionEVLRecipe() override = default;
2751 
2752   VPReductionEVLRecipe *clone() override {
2753     llvm_unreachable("cloning not implemented yet");
2754   }
2755 
2756   VP_CLASSOF_IMPL(VPDef::VPReductionEVLSC)
2757 
2758   /// Generate the reduction in the loop
2759   void execute(VPTransformState &State) override;
2760 
2761 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2762   /// Print the recipe.
2763   void print(raw_ostream &O, const Twine &Indent,
2764              VPSlotTracker &SlotTracker) const override;
2765 #endif
2766 
2767   /// The VPValue of the explicit vector length.
2768   VPValue *getEVL() const { return getOperand(2); }
2769 
2770   /// Returns true if the recipe only uses the first lane of operand \p Op.
2771   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2772     assert(is_contained(operands(), Op) &&
2773            "Op must be an operand of the recipe");
2774     return Op == getEVL();
2775   }
2776 };
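
// A sketch of how a transform might replace a VPReductionRecipe with its
// EVL-based form; Red, EVL and Mask are assumed to be provided by the
// caller, and the insert/replace/erase helpers are the usual ones from
// VPRecipeBase and VPValue:
//
//   auto *EVLRed = new VPReductionEVLRecipe(*Red, *EVL, Mask);
//   EVLRed->insertBefore(Red);
//   Red->replaceAllUsesWith(EVLRed);
//   Red->eraseFromParent();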
2777 
2778 /// VPReplicateRecipe replicates a given instruction producing multiple scalar
2779 /// copies of the original scalar type, one per lane, instead of producing a
2780 /// single copy of widened type for all lanes. If the instruction is known to be
2781 /// uniform, only one copy, for lane zero, will be generated.
2782 class VPReplicateRecipe : public VPRecipeWithIRFlags {
2783   /// Indicator if only a single replica per lane is needed.
2784   bool IsUniform;
2785 
2786   /// Indicator if the replicas are also predicated.
2787   bool IsPredicated;
2788 
2789 public:
2790   template <typename IterT>
2791   VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands,
2792                     bool IsUniform, VPValue *Mask = nullptr)
2793       : VPRecipeWithIRFlags(VPDef::VPReplicateSC, Operands, *I),
2794         IsUniform(IsUniform), IsPredicated(Mask) {
2795     if (Mask)
2796       addOperand(Mask);
2797   }
2798 
2799   ~VPReplicateRecipe() override = default;
2800 
2801   VPReplicateRecipe *clone() override {
2802     auto *Copy =
2803         new VPReplicateRecipe(getUnderlyingInstr(), operands(), IsUniform,
2804                               isPredicated() ? getMask() : nullptr);
2805     Copy->transferFlags(*this);
2806     return Copy;
2807   }
2808 
2809   VP_CLASSOF_IMPL(VPDef::VPReplicateSC)
2810 
2811   /// Generate replicas of the desired Ingredient. Replicas will be generated
2812   /// for all parts and lanes unless a specific part and lane are specified in
2813   /// the \p State.
2814   void execute(VPTransformState &State) override;
2815 
2816   /// Return the cost of this VPReplicateRecipe.
2817   InstructionCost computeCost(ElementCount VF,
2818                               VPCostContext &Ctx) const override;
2819 
2820 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2821   /// Print the recipe.
2822   void print(raw_ostream &O, const Twine &Indent,
2823              VPSlotTracker &SlotTracker) const override;
2824 #endif
2825 
2826   bool isUniform() const { return IsUniform; }
2827 
2828   bool isPredicated() const { return IsPredicated; }
2829 
2830   /// Returns true if the recipe only uses the first lane of operand \p Op.
2831   bool onlyFirstLaneUsed(const VPValue *Op) const override {
2832     assert(is_contained(operands(), Op) &&
2833            "Op must be an operand of the recipe");
2834     return isUniform();
2835   }
2836 
2837   /// Returns true if the recipe uses scalars of operand \p Op.
2838   bool usesScalars(const VPValue *Op) const override {
2839     assert(is_contained(operands(), Op) &&
2840            "Op must be an operand of the recipe");
2841     return true;
2842   }
2843 
2844   /// Returns true if the recipe is used by a widened recipe via an intervening
2845   /// VPPredInstPHIRecipe. In this case, the scalar values should also be packed
2846   /// in a vector.
2847   bool shouldPack() const;
2848 
2849   /// Return the mask of a predicated VPReplicateRecipe.
2850   VPValue *getMask() {
2851     assert(isPredicated() && "Trying to get the mask of an unpredicated recipe");
2852     return getOperand(getNumOperands() - 1);
2853   }
2854 
2855   unsigned getOpcode() const { return getUnderlyingInstr()->getOpcode(); }
2856 };
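
// The mask, when present, is always the last operand; a sketch of querying
// it safely (Rep is a hypothetical VPReplicateRecipe *):
//
//   VPValue *Mask = Rep->isPredicated() ? Rep->getMask() : nullptr;
//   unsigned Opcode = Rep->getOpcode(); // opcode of the underlying instr.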
2857 
2858 /// A recipe for generating conditional branches on the bits of a mask.
2859 class VPBranchOnMaskRecipe : public VPRecipeBase {
2860 public:
2861   VPBranchOnMaskRecipe(VPValue *BlockInMask)
2862       : VPRecipeBase(VPDef::VPBranchOnMaskSC, {}) {
2863     if (BlockInMask) // nullptr means all-one mask.
2864       addOperand(BlockInMask);
2865   }
2866 
2867   VPBranchOnMaskRecipe *clone() override {
2868     return new VPBranchOnMaskRecipe(getOperand(0));
2869   }
2870 
2871   VP_CLASSOF_IMPL(VPDef::VPBranchOnMaskSC)
2872 
2873   /// Generate the extraction of the appropriate bit from the block mask and the
2874   /// conditional branch.
2875   void execute(VPTransformState &State) override;
2876 
2877   /// Return the cost of this VPBranchOnMaskRecipe.
2878   InstructionCost computeCost(ElementCount VF,
2879                               VPCostContext &Ctx) const override;
2880 
2881 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2882   /// Print the recipe.
2883   void print(raw_ostream &O, const Twine &Indent,
2884              VPSlotTracker &SlotTracker) const override {
2885     O << Indent << "BRANCH-ON-MASK ";
2886     if (VPValue *Mask = getMask())
2887       Mask->printAsOperand(O, SlotTracker);
2888     else
2889       O << " All-One";
2890   }
2891 #endif
2892 
2893   /// Return the mask used by this recipe. Note that a full mask is represented
2894   /// by a nullptr.
2895   VPValue *getMask() const {
2896     assert(getNumOperands() <= 1 && "should have either 0 or 1 operands");
2897     // Mask is optional.
2898     return getNumOperands() == 1 ? getOperand(0) : nullptr;
2899   }
2900 
2901   /// Returns true if the recipe uses scalars of operand \p Op.
2902   bool usesScalars(const VPValue *Op) const override {
2903     assert(is_contained(operands(), Op) &&
2904            "Op must be an operand of the recipe");
2905     return true;
2906   }
2907 };
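
// getMask() encodes a full (all-one) mask as nullptr; a sketch of handling
// both cases (Branch is a hypothetical VPBranchOnMaskRecipe *):
//
//   if (VPValue *Mask = Branch->getMask())
//     ...; // Conditional: branch on the relevant bit of Mask.
//   else
//     ...; // Unconditional: an all-one mask is implied.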
2908 
2909 /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
2910 /// control converges back from a Branch-on-Mask. The phi nodes are needed in
2911 /// order to merge values that are set under such a branch and feed their uses.
2912 /// The phi nodes can be scalar or vector depending on the users of the value.
2913 /// This recipe works in concert with VPBranchOnMaskRecipe.
2914 class VPPredInstPHIRecipe : public VPSingleDefRecipe {
2915 public:
2916   /// Construct a VPPredInstPHIRecipe given \p PredV, whose value needs phi
2917   /// nodes after merging back from a Branch-on-Mask.
2918   VPPredInstPHIRecipe(VPValue *PredV, DebugLoc DL)
2919       : VPSingleDefRecipe(VPDef::VPPredInstPHISC, PredV, DL) {}
2920   ~VPPredInstPHIRecipe() override = default;
2921 
2922   VPPredInstPHIRecipe *clone() override {
2923     return new VPPredInstPHIRecipe(getOperand(0), getDebugLoc());
2924   }
2925 
2926   VP_CLASSOF_IMPL(VPDef::VPPredInstPHISC)
2927 
2928   /// Generates phi nodes for live-outs (from a replicate region) as needed to
2929   /// retain SSA form.
2930   void execute(VPTransformState &State) override;
2931 
2932   /// Return the cost of this VPPredInstPHIRecipe.
2933   InstructionCost computeCost(ElementCount VF,
2934                               VPCostContext &Ctx) const override {
2935     // TODO: Compute accurate cost after retiring the legacy cost model.
2936     return 0;
2937   }
2938 
2939 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2940   /// Print the recipe.
2941   void print(raw_ostream &O, const Twine &Indent,
2942              VPSlotTracker &SlotTracker) const override;
2943 #endif
2944 
2945   /// Returns true if the recipe uses scalars of operand \p Op.
2946   bool usesScalars(const VPValue *Op) const override {
2947     assert(is_contained(operands(), Op) &&
2948            "Op must be an operand of the recipe");
2949     return true;
2950   }
2951 };
2952 
2953 /// A common base class for widening memory operations. An optional mask can be
2954 /// provided as the last operand.
2955 class VPWidenMemoryRecipe : public VPRecipeBase {
2956 protected:
2957   Instruction &Ingredient;
2958 
2959   /// Whether the accessed addresses are consecutive.
2960   bool Consecutive;
2961 
2962   /// Whether the consecutive accessed addresses are in reverse order.
2963   bool Reverse;
2964 
2965   /// Whether the memory access is masked.
2966   bool IsMasked = false;
2967 
2968   void setMask(VPValue *Mask) {
2969     assert(!IsMasked && "cannot re-set mask");
2970     if (!Mask)
2971       return;
2972     addOperand(Mask);
2973     IsMasked = true;
2974   }
2975 
2976   VPWidenMemoryRecipe(const unsigned char SC, Instruction &I,
2977                       std::initializer_list<VPValue *> Operands,
2978                       bool Consecutive, bool Reverse, DebugLoc DL)
2979       : VPRecipeBase(SC, Operands, DL), Ingredient(I), Consecutive(Consecutive),
2980         Reverse(Reverse) {
2981     assert((Consecutive || !Reverse) && "Reverse implies consecutive");
2982   }
2983 
2984 public:
2985   VPWidenMemoryRecipe *clone() override {
2986     llvm_unreachable("cloning not supported");
2987   }
2988 
2989   static inline bool classof(const VPRecipeBase *R) {
2990     return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
2991            R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
2992            R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
2993            R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
2994   }
2995 
2996   static inline bool classof(const VPUser *U) {
2997     auto *R = dyn_cast<VPRecipeBase>(U);
2998     return R && classof(R);
2999   }
3000 
3001   /// Return whether the loaded-from / stored-to addresses are consecutive.
3002   bool isConsecutive() const { return Consecutive; }
3003 
3004   /// Return whether the consecutive loaded/stored addresses are in reverse
3005   /// order.
3006   bool isReverse() const { return Reverse; }
3007 
3008   /// Return the address accessed by this recipe.
3009   VPValue *getAddr() const { return getOperand(0); }
3010 
3011   /// Returns true if the recipe is masked.
3012   bool isMasked() const { return IsMasked; }
3013 
3014   /// Return the mask used by this recipe. Note that a full mask is represented
3015   /// by a nullptr.
3016   VPValue *getMask() const {
3017     // Mask is optional and therefore the last operand.
3018     return isMasked() ? getOperand(getNumOperands() - 1) : nullptr;
3019   }
3020 
3021   /// Generate the wide load/store.
3022   void execute(VPTransformState &State) override {
3023     llvm_unreachable("VPWidenMemoryRecipe should not be instantiated.");
3024   }
3025 
3026   /// Return the cost of this VPWidenMemoryRecipe.
3027   InstructionCost computeCost(ElementCount VF,
3028                               VPCostContext &Ctx) const override;
3029 
3030   Instruction &getIngredient() const { return Ingredient; }
3031 };
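
// The classof overloads above allow dispatching over all four memory recipe
// kinds with the usual isa/dyn_cast machinery; a sketch, where R is some
// VPRecipeBase *:
//
//   if (auto *Mem = dyn_cast<VPWidenMemoryRecipe>(R)) {
//     VPValue *Addr = Mem->getAddr(); // operand 0 for loads and stores.
//     VPValue *Mask = Mem->getMask(); // nullptr when unmasked.
//     bool NeedsGatherScatter = !Mem->isConsecutive();
//   }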
3032 
3033 /// A recipe for widening load operations, using the address to load from and an
3034 /// optional mask.
3035 struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
3036   VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
3037                     bool Consecutive, bool Reverse, DebugLoc DL)
3038       : VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
3039                             Reverse, DL),
3040         VPValue(this, &Load) {
3041     setMask(Mask);
3042   }
3043 
3044   VPWidenLoadRecipe *clone() override {
3045     return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
3046                                  getMask(), Consecutive, Reverse,
3047                                  getDebugLoc());
3048   }
3049 
3050   VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC)
3051 
3052   /// Generate a wide load or gather.
3053   void execute(VPTransformState &State) override;
3054 
3055 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3056   /// Print the recipe.
3057   void print(raw_ostream &O, const Twine &Indent,
3058              VPSlotTracker &SlotTracker) const override;
3059 #endif
3060 
3061   /// Returns true if the recipe only uses the first lane of operand \p Op.
3062   bool onlyFirstLaneUsed(const VPValue *Op) const override {
3063     assert(is_contained(operands(), Op) &&
3064            "Op must be an operand of the recipe");
3065     // Widened, consecutive load operations only demand the first lane of
3066     // their address.
3067     return Op == getAddr() && isConsecutive();
3068   }
3069 };
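
// A sketch of building a widened load from its IR ingredient; Load, Addr,
// Mask and DL are assumed to be set up by the surrounding recipe-construction
// logic:
//
//   auto *WideLoad = new VPWidenLoadRecipe(*Load, Addr, Mask,
//                                          /*Consecutive=*/true,
//                                          /*Reverse=*/false, DL);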
3070 
3071 /// A recipe for widening load operations with vector-predication intrinsics,
3072 /// using the address to load from, the explicit vector length and an optional
3073 /// mask.
3074 struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
3075   VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask)
3076       : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
3077                             {L.getAddr(), &EVL}, L.isConsecutive(),
3078                             L.isReverse(), L.getDebugLoc()),
3079         VPValue(this, &getIngredient()) {
3080     setMask(Mask);
3081   }
3082 
3083   VP_CLASSOF_IMPL(VPDef::VPWidenLoadEVLSC)
3084 
3085   /// Return the EVL operand.
3086   VPValue *getEVL() const { return getOperand(1); }
3087 
3088   /// Generate the wide load or gather.
3089   void execute(VPTransformState &State) override;
3090 
3091   /// Return the cost of this VPWidenLoadEVLRecipe.
3092   InstructionCost computeCost(ElementCount VF,
3093                               VPCostContext &Ctx) const override;
3094 
3095 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3096   /// Print the recipe.
3097   void print(raw_ostream &O, const Twine &Indent,
3098              VPSlotTracker &SlotTracker) const override;
3099 #endif
3100 
3101   /// Returns true if the recipe only uses the first lane of operand \p Op.
3102   bool onlyFirstLaneUsed(const VPValue *Op) const override {
3103     assert(is_contained(operands(), Op) &&
3104            "Op must be an operand of the recipe");
3105     // Widened loads only demand the first lane of EVL and consecutive loads
3106     // only demand the first lane of their address.
3107     return Op == getEVL() || (Op == getAddr() && isConsecutive());
3108   }
3109 };
3110 
3111 /// A recipe for widening store operations, using the stored value, the address
3112 /// to store to and an optional mask.
3113 struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
3114   VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
3115                      VPValue *Mask, bool Consecutive, bool Reverse, DebugLoc DL)
3116       : VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
3117                             Consecutive, Reverse, DL) {
3118     setMask(Mask);
3119   }
3120 
3121   VPWidenStoreRecipe *clone() override {
3122     return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
3123                                   getStoredValue(), getMask(), Consecutive,
3124                                   Reverse, getDebugLoc());
3125   }
3126 
3127   VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC)
3128 
3129   /// Return the value stored by this recipe.
3130   VPValue *getStoredValue() const { return getOperand(1); }
3131 
3132   /// Generate a wide store or scatter.
3133   void execute(VPTransformState &State) override;
3134 
3135 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3136   /// Print the recipe.
3137   void print(raw_ostream &O, const Twine &Indent,
3138              VPSlotTracker &SlotTracker) const override;
3139 #endif
3140 
3141   /// Returns true if the recipe only uses the first lane of operand \p Op.
3142   bool onlyFirstLaneUsed(const VPValue *Op) const override {
3143     assert(is_contained(operands(), Op) &&
3144            "Op must be an operand of the recipe");
3145     // Widened, consecutive stores only demand the first lane of their address,
3146     // unless the same operand is also stored.
3147     return Op == getAddr() && isConsecutive() && Op != getStoredValue();
3148   }
3149 };
3150 
3151 /// A recipe for widening store operations with vector-predication intrinsics,
3152 /// using the value to store, the address to store to, the explicit vector
3153 /// length and an optional mask.
3154 struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
3155   VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask)
3156       : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
3157                             {S.getAddr(), S.getStoredValue(), &EVL},
3158                             S.isConsecutive(), S.isReverse(), S.getDebugLoc()) {
3159     setMask(Mask);
3160   }
3161 
3162   VP_CLASSOF_IMPL(VPDef::VPWidenStoreEVLSC)
3163 
3164   /// Return the value stored by this recipe.
3165   VPValue *getStoredValue() const { return getOperand(1); }
3166 
3167   /// Return the EVL operand.
3168   VPValue *getEVL() const { return getOperand(2); }
3169 
3170   /// Generate the wide store or scatter.
3171   void execute(VPTransformState &State) override;
3172 
3173   /// Return the cost of this VPWidenStoreEVLRecipe.
3174   InstructionCost computeCost(ElementCount VF,
3175                               VPCostContext &Ctx) const override;
3176 
3177 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3178   /// Print the recipe.
3179   void print(raw_ostream &O, const Twine &Indent,
3180              VPSlotTracker &SlotTracker) const override;
3181 #endif
3182 
3183   /// Returns true if the recipe only uses the first lane of operand \p Op.
3184   bool onlyFirstLaneUsed(const VPValue *Op) const override {
3185     assert(is_contained(operands(), Op) &&
3186            "Op must be an operand of the recipe");
3187     if (Op == getEVL()) {
3188       assert(getStoredValue() != Op && "unexpected store of EVL");
3189       return true;
3190     }
3191     // Widened, consecutive memory operations only demand the first lane of
3192     // their address, unless the same operand is also stored. The latter can
3193     // happen with opaque pointers.
3194     return Op == getAddr() && isConsecutive() && Op != getStoredValue();
3195   }
3196 };
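
// The EVL store's operands are {Addr, StoredValue, EVL, [Mask]}; a sketch of
// the accessors (S is a hypothetical VPWidenStoreEVLRecipe *):
//
//   VPValue *Addr = S->getAddr();       // operand 0.
//   VPValue *Val = S->getStoredValue(); // operand 1.
//   VPValue *EVL = S->getEVL();         // operand 2.
//   VPValue *Mask = S->getMask();       // last operand, or nullptr.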
3197 
3198 /// Recipe to expand a SCEV expression.
3199 class VPExpandSCEVRecipe : public VPSingleDefRecipe {
3200   const SCEV *Expr;
3201   ScalarEvolution &SE;
3202 
3203 public:
3204   VPExpandSCEVRecipe(const SCEV *Expr, ScalarEvolution &SE)
3205       : VPSingleDefRecipe(VPDef::VPExpandSCEVSC, {}), Expr(Expr), SE(SE) {}
3206 
3207   ~VPExpandSCEVRecipe() override = default;
3208 
3209   VPExpandSCEVRecipe *clone() override {
3210     return new VPExpandSCEVRecipe(Expr, SE);
3211   }
3212 
3213   VP_CLASSOF_IMPL(VPDef::VPExpandSCEVSC)
3214 
3215   /// Generate a value by expanding the SCEV expression.
3216   void execute(VPTransformState &State) override;
3217 
3218   /// Return the cost of this VPExpandSCEVRecipe.
3219   InstructionCost computeCost(ElementCount VF,
3220                               VPCostContext &Ctx) const override {
3221     // TODO: Compute accurate cost after retiring the legacy cost model.
3222     return 0;
3223   }
3224 
3225 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3226   /// Print the recipe.
3227   void print(raw_ostream &O, const Twine &Indent,
3228              VPSlotTracker &SlotTracker) const override;
3229 #endif
3230 
3231   const SCEV *getSCEV() const { return Expr; }
3232 };
3233 
3234 /// Canonical scalar induction phi of the vector loop, starting at the specified
3235 /// start value (either 0 or the resume value when vectorizing the epilogue
3236 /// loop). VPWidenCanonicalIVRecipe represents the vector version of the
3237 /// canonical induction variable.
3238 class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
3239 public:
3240   VPCanonicalIVPHIRecipe(VPValue *StartV, DebugLoc DL)
3241       : VPHeaderPHIRecipe(VPDef::VPCanonicalIVPHISC, nullptr, StartV, DL) {}
3242 
3243   ~VPCanonicalIVPHIRecipe() override = default;
3244 
3245   VPCanonicalIVPHIRecipe *clone() override {
3246     auto *R = new VPCanonicalIVPHIRecipe(getOperand(0), getDebugLoc());
3247     R->addOperand(getBackedgeValue());
3248     return R;
3249   }
3250 
3251   VP_CLASSOF_IMPL(VPDef::VPCanonicalIVPHISC)
3252 
3253   static inline bool classof(const VPHeaderPHIRecipe *D) {
3254     return D->getVPDefID() == VPDef::VPCanonicalIVPHISC;
3255   }
3256 
3257   void execute(VPTransformState &State) override {
3258     llvm_unreachable(
3259         "cannot execute this recipe, should be replaced by VPScalarPHIRecipe");
3260   }
3261 
3262 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3263   /// Print the recipe.
3264   void print(raw_ostream &O, const Twine &Indent,
3265              VPSlotTracker &SlotTracker) const override;
3266 #endif
3267 
3268   /// Returns the scalar type of the induction.
3269   Type *getScalarType() const {
3270     return getStartValue()->getLiveInIRValue()->getType();
3271   }
3272 
3273   /// Returns true if the recipe only uses the first lane of operand \p Op.
3274   bool onlyFirstLaneUsed(const VPValue *Op) const override {
3275     assert(is_contained(operands(), Op) &&
3276            "Op must be an operand of the recipe");
3277     return true;
3278   }
3279 
3280   /// Returns true if the recipe only uses the first part of operand \p Op.
3281   bool onlyFirstPartUsed(const VPValue *Op) const override {
3282     assert(is_contained(operands(), Op) &&
3283            "Op must be an operand of the recipe");
3284     return true;
3285   }
3286 
3287   /// Return the cost of this VPCanonicalIVPHIRecipe.
3288   InstructionCost computeCost(ElementCount VF,
3289                               VPCostContext &Ctx) const override {
3290     // For now, match the behavior of the legacy cost model.
3291     return 0;
3292   }
3293 };
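
// The canonical IV is usually retrieved from the enclosing plan rather than
// constructed directly; a sketch (Plan is a hypothetical VPlan &):
//
//   VPCanonicalIVPHIRecipe *CanIV = Plan.getCanonicalIV();
//   Type *IVTy = CanIV->getScalarType(); // scalar type of the induction.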
3294 
3295 /// A recipe for generating the active lane mask for the vector loop that is
3296 /// used to predicate the vector operations.
3297 /// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
3298 /// remove VPActiveLaneMaskPHIRecipe.
3299 class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
3300 public:
3301   VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL)
3302       : VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask,
3303                           DL) {}
3304 
3305   ~VPActiveLaneMaskPHIRecipe() override = default;
3306 
3307   VPActiveLaneMaskPHIRecipe *clone() override {
3308     auto *R = new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
3309     if (getNumOperands() == 2)
3310       R->addOperand(getOperand(1));
3311     return R;
3312   }
3313 
3314   VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)
3315 
3316   static inline bool classof(const VPHeaderPHIRecipe *D) {
3317     return D->getVPDefID() == VPDef::VPActiveLaneMaskPHISC;
3318   }
3319 
3320   /// Generate the active lane mask phi of the vector loop.
3321   void execute(VPTransformState &State) override;
3322 
3323 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3324   /// Print the recipe.
3325   void print(raw_ostream &O, const Twine &Indent,
3326              VPSlotTracker &SlotTracker) const override;
3327 #endif
3328 };
3329 
3330 /// A recipe for generating the phi node for the current index of elements,
3331 /// adjusted in accordance with EVL value. It starts at the start value of the
3332 /// canonical induction and gets incremented by EVL in each iteration of the
3333 /// vector loop.
3334 class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe {
3335 public:
3336   VPEVLBasedIVPHIRecipe(VPValue *StartIV, DebugLoc DL)
3337       : VPHeaderPHIRecipe(VPDef::VPEVLBasedIVPHISC, nullptr, StartIV, DL) {}
3338 
3339   ~VPEVLBasedIVPHIRecipe() override = default;
3340 
3341   VPEVLBasedIVPHIRecipe *clone() override {
3342     llvm_unreachable("cloning not implemented yet");
3343   }
3344 
3345   VP_CLASSOF_IMPL(VPDef::VPEVLBasedIVPHISC)
3346 
3347   static inline bool classof(const VPHeaderPHIRecipe *D) {
3348     return D->getVPDefID() == VPDef::VPEVLBasedIVPHISC;
3349   }
3350 
3351   void execute(VPTransformState &State) override {
3352     llvm_unreachable(
3353         "cannot execute this recipe, should be replaced by VPScalarPHIRecipe");
3354   }
3355 
3356   /// Return the cost of this VPEVLBasedIVPHIRecipe.
3357   InstructionCost computeCost(ElementCount VF,
3358                               VPCostContext &Ctx) const override {
3359     // For now, match the behavior of the legacy cost model.
3360     return 0;
3361   }
3362 
3363   /// Returns true if the recipe only uses the first lane of operand \p Op.
3364   bool onlyFirstLaneUsed(const VPValue *Op) const override {
3365     assert(is_contained(operands(), Op) &&
3366            "Op must be an operand of the recipe");
3367     return true;
3368   }
3369 
3370 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3371   /// Print the recipe.
3372   void print(raw_ostream &O, const Twine &Indent,
3373              VPSlotTracker &SlotTracker) const override;
3374 #endif
3375 };
3376 
3377 /// A Recipe for widening the canonical induction variable of the vector loop.
3378 class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe,
3379                                  public VPUnrollPartAccessor<1> {
3380 public:
3381   VPWidenCanonicalIVRecipe(VPCanonicalIVPHIRecipe *CanonicalIV)
3382       : VPSingleDefRecipe(VPDef::VPWidenCanonicalIVSC, {CanonicalIV}) {}
3383 
3384   ~VPWidenCanonicalIVRecipe() override = default;
3385 
3386   VPWidenCanonicalIVRecipe *clone() override {
3387     return new VPWidenCanonicalIVRecipe(
3388         cast<VPCanonicalIVPHIRecipe>(getOperand(0)));
3389   }
3390 
3391   VP_CLASSOF_IMPL(VPDef::VPWidenCanonicalIVSC)
3392 
3393   /// Generate a canonical vector induction variable of the vector loop, with
3394   /// start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF}, and
3395   /// step = <VF*UF, VF*UF, ..., VF*UF>.
3396   void execute(VPTransformState &State) override;
3397 
3398   /// Return the cost of this VPWidenCanonicalIVRecipe.
3399   InstructionCost computeCost(ElementCount VF,
3400                               VPCostContext &Ctx) const override {
3401     // TODO: Compute accurate cost after retiring the legacy cost model.
3402     return 0;
3403   }
3404 
3405 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3406   /// Print the recipe.
3407   void print(raw_ostream &O, const Twine &Indent,
3408              VPSlotTracker &SlotTracker) const override;
3409 #endif
3410 };
3411 
3412 /// A recipe for converting the input value \p IV to the corresponding
3413 /// value of an IV with different start and step values, using Start + IV *
3414 /// Step.
3415 class VPDerivedIVRecipe : public VPSingleDefRecipe {
3416   /// Kind of the induction.
3417   const InductionDescriptor::InductionKind Kind;
3418   /// If not nullptr, the floating point induction binary operator. Must be set
3419   /// for floating point inductions.
3420   const FPMathOperator *FPBinOp;
3421 
3422   /// Name to use for the generated IR instruction for the derived IV.
3423   std::string Name;
3424 
3425 public:
3426   VPDerivedIVRecipe(const InductionDescriptor &IndDesc, VPValue *Start,
3427                     VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step,
3428                     const Twine &Name = "")
3429       : VPDerivedIVRecipe(
3430             IndDesc.getKind(),
3431             dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp()),
3432             Start, CanonicalIV, Step, Name) {}
3433 
3434   VPDerivedIVRecipe(InductionDescriptor::InductionKind Kind,
3435                     const FPMathOperator *FPBinOp, VPValue *Start, VPValue *IV,
3436                     VPValue *Step, const Twine &Name = "")
3437       : VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, IV, Step}), Kind(Kind),
3438         FPBinOp(FPBinOp), Name(Name.str()) {}
3439 
3440   ~VPDerivedIVRecipe() override = default;
3441 
3442   VPDerivedIVRecipe *clone() override {
3443     return new VPDerivedIVRecipe(Kind, FPBinOp, getStartValue(), getOperand(1),
3444                                  getStepValue());
3445   }
3446 
3447   VP_CLASSOF_IMPL(VPDef::VPDerivedIVSC)
3448 
3449   /// Generate the transformed value of the induction at offset StartValue (1.
3450   /// operand) + IV (2. operand) * StepValue (3. operand).
3451   void execute(VPTransformState &State) override;
3452 
3453   /// Return the cost of this VPDerivedIVRecipe.
3454   InstructionCost computeCost(ElementCount VF,
3455                               VPCostContext &Ctx) const override {
3456     // TODO: Compute accurate cost after retiring the legacy cost model.
3457     return 0;
3458   }
3459 
3460 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3461   /// Print the recipe.
3462   void print(raw_ostream &O, const Twine &Indent,
3463              VPSlotTracker &SlotTracker) const override;
3464 #endif
3465 
3466   Type *getScalarType() const {
3467     return getStartValue()->getLiveInIRValue()->getType();
3468   }
3469 
3470   VPValue *getStartValue() const { return getOperand(0); }
3471   VPValue *getStepValue() const { return getOperand(2); }
3472 
3473   /// Returns true if the recipe only uses the first lane of operand \p Op.
3474   bool onlyFirstLaneUsed(const VPValue *Op) const override {
3475     assert(is_contained(operands(), Op) &&
3476            "Op must be an operand of the recipe");
3477     return true;
3478   }
3479 };
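
// The derived IV computes Start + IV * Step; a sketch of how the operands
// map onto that formula (D is a hypothetical VPDerivedIVRecipe *):
//
//   VPValue *Start = D->getStartValue(); // operand 0.
//   VPValue *IV = D->getOperand(1);      // operand 1: input IV.
//   VPValue *Step = D->getStepValue();   // operand 2.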
3480 
3481 /// A recipe for handling phi nodes of integer and floating-point inductions,
3482 /// producing their scalar values.
3483 class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags,
3484                               public VPUnrollPartAccessor<2> {
3485   Instruction::BinaryOps InductionOpcode;
3486 
3487 public:
3488   VPScalarIVStepsRecipe(VPValue *IV, VPValue *Step,
3489                         Instruction::BinaryOps Opcode, FastMathFlags FMFs)
3490       : VPRecipeWithIRFlags(VPDef::VPScalarIVStepsSC,
3491                             ArrayRef<VPValue *>({IV, Step}), FMFs),
3492         InductionOpcode(Opcode) {}
3493 
3494   VPScalarIVStepsRecipe(const InductionDescriptor &IndDesc, VPValue *IV,
3495                         VPValue *Step)
3496       : VPScalarIVStepsRecipe(
3497             IV, Step, IndDesc.getInductionOpcode(),
3498             dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp())
3499                 ? IndDesc.getInductionBinOp()->getFastMathFlags()
3500                 : FastMathFlags()) {}
3501 
3502   ~VPScalarIVStepsRecipe() override = default;
3503 
3504   VPScalarIVStepsRecipe *clone() override {
3505     return new VPScalarIVStepsRecipe(
3506         getOperand(0), getOperand(1), InductionOpcode,
3507         hasFastMathFlags() ? getFastMathFlags() : FastMathFlags());
3508   }
3509 
3510   VP_CLASSOF_IMPL(VPDef::VPScalarIVStepsSC)
3511 
3512   /// Generate the scalarized versions of the phi node as needed by its users.
3513   void execute(VPTransformState &State) override;
3514 
3515   /// Return the cost of this VPScalarIVStepsRecipe.
3516   InstructionCost computeCost(ElementCount VF,
3517                               VPCostContext &Ctx) const override {
3518     // TODO: Compute accurate cost after retiring the legacy cost model.
3519     return 0;
3520   }
3521 
3522 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3523   /// Print the recipe.
3524   void print(raw_ostream &O, const Twine &Indent,
3525              VPSlotTracker &SlotTracker) const override;
3526 #endif
3527 
3528   VPValue *getStepValue() const { return getOperand(1); }
3529 
3530   /// Returns true if the recipe only uses the first lane of operand \p Op.
3531   bool onlyFirstLaneUsed(const VPValue *Op) const override {
3532     assert(is_contained(operands(), Op) &&
3533            "Op must be an operand of the recipe");
3534     return true;
3535   }
3536 };
3537 
3538 /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
3539 /// holds a sequence of zero or more VPRecipe's each representing a sequence of
3540 /// output IR instructions. All PHI-like recipes must come before any non-PHI recipes.
3541 class VPBasicBlock : public VPBlockBase {
3542   friend class VPlan;
3543 
3544   /// Use VPlan::createVPBasicBlock to create VPBasicBlocks.
3545   VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr)
3546       : VPBlockBase(VPBasicBlockSC, Name.str()) {
3547     if (Recipe)
3548       appendRecipe(Recipe);
3549   }
3550 
3551 public:
3552   using RecipeListTy = iplist<VPRecipeBase>;
3553 
3554 protected:
3555   /// The VPRecipes held in the order of output instructions to generate.
3556   RecipeListTy Recipes;
3557 
3558   VPBasicBlock(const unsigned char BlockSC, const Twine &Name = "")
3559       : VPBlockBase(BlockSC, Name.str()) {}
3560 
3561 public:
3562   ~VPBasicBlock() override {
3563     while (!Recipes.empty())
3564       Recipes.pop_back();
3565   }
3566 
3567   /// Instruction iterators...
3568   using iterator = RecipeListTy::iterator;
3569   using const_iterator = RecipeListTy::const_iterator;
3570   using reverse_iterator = RecipeListTy::reverse_iterator;
3571   using const_reverse_iterator = RecipeListTy::const_reverse_iterator;
3572 
3573   //===--------------------------------------------------------------------===//
3574   /// Recipe iterator methods
3575   ///
3576   inline iterator begin() { return Recipes.begin(); }
3577   inline const_iterator begin() const { return Recipes.begin(); }
3578   inline iterator end() { return Recipes.end(); }
3579   inline const_iterator end() const { return Recipes.end(); }
3580 
3581   inline reverse_iterator rbegin() { return Recipes.rbegin(); }
3582   inline const_reverse_iterator rbegin() const { return Recipes.rbegin(); }
3583   inline reverse_iterator rend() { return Recipes.rend(); }
3584   inline const_reverse_iterator rend() const { return Recipes.rend(); }
3585 
3586   inline size_t size() const { return Recipes.size(); }
3587   inline bool empty() const { return Recipes.empty(); }
3588   inline const VPRecipeBase &front() const { return Recipes.front(); }
3589   inline VPRecipeBase &front() { return Recipes.front(); }
3590   inline const VPRecipeBase &back() const { return Recipes.back(); }
3591   inline VPRecipeBase &back() { return Recipes.back(); }
3592 
3593   /// Returns a reference to the list of recipes.
3594   RecipeListTy &getRecipeList() { return Recipes; }
3595 
3596   /// Returns a pointer to a member of the recipe list.
3597   static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) {
3598     return &VPBasicBlock::Recipes;
3599   }
3600 
3601   /// Method to support type inquiry through isa, cast, and dyn_cast.
3602   static inline bool classof(const VPBlockBase *V) {
3603     return V->getVPBlockID() == VPBlockBase::VPBasicBlockSC ||
3604            V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC;
3605   }
3606 
3607   void insert(VPRecipeBase *Recipe, iterator InsertPt) {
3608     assert(Recipe && "No recipe to append.");
3609     assert(!Recipe->Parent && "Recipe already in VPlan");
3610     Recipe->Parent = this;
3611     Recipes.insert(InsertPt, Recipe);
3612   }
3613 
3614   /// Augment the existing recipes of a VPBasicBlock with an additional
3615   /// \p Recipe as the last recipe.
3616   void appendRecipe(VPRecipeBase *Recipe) { insert(Recipe, end()); }
3617 
3618   /// The method which generates the output IR instructions that correspond to
3619   /// this VPBasicBlock, thereby "executing" the VPlan.
3620   void execute(VPTransformState *State) override;
3621 
3622   /// Return the cost of this VPBasicBlock.
3623   InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
3624 
3625   /// Return the position of the first non-phi node recipe in the block.
3626   iterator getFirstNonPhi();
3627 
3628   /// Returns an iterator range over the PHI-like recipes in the block.
3629   iterator_range<iterator> phis() {
3630     return make_range(begin(), getFirstNonPhi());
3631   }
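
  // A sketch of iterating a block's recipes, visiting PHI-like recipes first
  // (VPBB is a hypothetical VPBasicBlock *):
  //
  //   for (VPRecipeBase &PhiR : VPBB->phis())
  //     ...; // All PHI-like recipes, i.e. [begin(), getFirstNonPhi()).
  //   for (VPRecipeBase &R : *VPBB)
  //     ...; // Every recipe in the block, in order.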
3632 
3633   /// Split current block at \p SplitAt by inserting a new block between the
3634   /// current block and its successors and moving all recipes starting at
3635   /// SplitAt to the new block. Returns the new block.
3636   VPBasicBlock *splitAt(iterator SplitAt);
3637 
3638   VPRegionBlock *getEnclosingLoopRegion();
3639   const VPRegionBlock *getEnclosingLoopRegion() const;
3640 
3641 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3642   /// Print this VPBasicBlock to \p O, prefixing all lines with \p Indent. \p
3643   /// SlotTracker is used to print unnamed VPValue's using consecutive numbers.
3644   ///
3645   /// Note that the numbering is applied to the whole VPlan, so printing
3646   /// individual blocks is consistent with the whole VPlan printing.
3647   void print(raw_ostream &O, const Twine &Indent,
3648              VPSlotTracker &SlotTracker) const override;
3649   using VPBlockBase::print; // Get the print(raw_stream &O) version.
3650 #endif
3651 
3652   /// If the block has multiple successors, return the branch recipe terminating
3653   /// the block. If there is no successor or only a single one, return nullptr.
3654   VPRecipeBase *getTerminator();
3655   const VPRecipeBase *getTerminator() const;
3656 
3657   /// Returns true if the block is exiting its parent region.
3658   bool isExiting() const;
3659 
3660   /// Clone the current block and its recipes, without updating the operands of
3661   /// the cloned recipes.
3662   VPBasicBlock *clone() override;
3663 
3664 protected:
3665   /// Execute the recipes in the IR basic block \p BB.
3666   void executeRecipes(VPTransformState *State, BasicBlock *BB);
3667 
3668   /// Connect the VPBBs predecessors' in the VPlan CFG to the IR basic block
3669   /// generated for this VPBB.
3670   void connectToPredecessors(VPTransformState::CFGState &CFG);
3671 
3672 private:
3673   /// Create an IR BasicBlock to hold the output instructions generated by this
3674   /// VPBasicBlock, and return it. Update the CFGState accordingly.
3675   BasicBlock *createEmptyBasicBlock(VPTransformState::CFGState &CFG);
3676 };
3677 
3678 /// A special type of VPBasicBlock that wraps an existing IR basic block.
3679 /// Recipes of the block get added before the first non-phi instruction in the
3680 /// wrapped block.
3681 /// Note: At the moment, VPIRBasicBlock can only be used to wrap VPlan's
3682 /// preheader block.
3683 class VPIRBasicBlock : public VPBasicBlock {
3684   friend class VPlan;
3685 
3686   BasicBlock *IRBB;
3687 
3688   /// Use VPlan::createVPIRBasicBlock to create VPIRBasicBlocks.
3689   VPIRBasicBlock(BasicBlock *IRBB)
3690       : VPBasicBlock(VPIRBasicBlockSC,
3691                      (Twine("ir-bb<") + IRBB->getName() + Twine(">")).str()),
3692         IRBB(IRBB) {}
3693 
3694 public:
3695   ~VPIRBasicBlock() override {}
3696 
3697   static inline bool classof(const VPBlockBase *V) {
3698     return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC;
3699   }
3700 
3701   /// The method which generates the output IR instructions that correspond to
3702   /// this VPBasicBlock, thereby "executing" the VPlan.
3703   void execute(VPTransformState *State) override;
3704 
3705   VPIRBasicBlock *clone() override;
3706 
3707   BasicBlock *getIRBasicBlock() const { return IRBB; }
3708 };
3709 
3710 /// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks
3711 /// which form a Single-Entry-Single-Exiting subgraph of the output IR CFG.
3712 /// A VPRegionBlock may indicate that its contents are to be replicated several
3713 /// times. This is designed to support predicated scalarization, in which a
3714 /// scalar if-then code structure needs to be generated VF * UF times. Having
3715 /// this replication indicator helps to keep a single model for multiple
3716 /// candidate VF's. The actual replication takes place only once the desired VF
3717 /// and UF have been determined.
3718 class VPRegionBlock : public VPBlockBase {
3719   friend class VPlan;
3720 
3721   /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock.
3722   VPBlockBase *Entry;
3723 
3724   /// Hold the Single Exiting block of the SESE region modelled by the
3725   /// VPRegionBlock.
3726   VPBlockBase *Exiting;
3727 
3728   /// An indicator whether this region is to generate multiple replicated
3729   /// instances of output IR corresponding to its VPBlockBases.
3730   bool IsReplicator;
3731 
3732   /// Use VPlan::createVPRegionBlock to create VPRegionBlocks.
3733   VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
3734                 const std::string &Name = "", bool IsReplicator = false)
3735       : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting),
3736         IsReplicator(IsReplicator) {
3737     assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
3738     assert(Exiting->getSuccessors().empty() && "Exit block has successors.");
3739     Entry->setParent(this);
3740     Exiting->setParent(this);
3741   }
3742   VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
3743       : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr),
3744         IsReplicator(IsReplicator) {}
3745 
3746 public:
3747   ~VPRegionBlock() override {}
3748 
3749   /// Method to support type inquiry through isa, cast, and dyn_cast.
3750   static inline bool classof(const VPBlockBase *V) {
3751     return V->getVPBlockID() == VPBlockBase::VPRegionBlockSC;
3752   }
3753 
3754   const VPBlockBase *getEntry() const { return Entry; }
3755   VPBlockBase *getEntry() { return Entry; }
3756 
3757   /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
3758   /// EntryBlock must have no predecessors.
3759   void setEntry(VPBlockBase *EntryBlock) {
3760     assert(EntryBlock->getPredecessors().empty() &&
3761            "Entry block cannot have predecessors.");
3762     Entry = EntryBlock;
3763     EntryBlock->setParent(this);
3764   }
3765 
3766   const VPBlockBase *getExiting() const { return Exiting; }
3767   VPBlockBase *getExiting() { return Exiting; }
3768 
3769   /// Set \p ExitingBlock as the exiting VPBlockBase of this VPRegionBlock. \p
3770   /// ExitingBlock must have no successors.
3771   void setExiting(VPBlockBase *ExitingBlock) {
3772     assert(ExitingBlock->getSuccessors().empty() &&
3773            "Exit block cannot have successors.");
3774     Exiting = ExitingBlock;
3775     ExitingBlock->setParent(this);
3776   }
3777 
3778   /// Returns the pre-header VPBasicBlock of the loop region.
3779   VPBasicBlock *getPreheaderVPBB() {
3780     assert(!isReplicator() && "should only get pre-header of loop regions");
3781     return getSinglePredecessor()->getExitingBasicBlock();
3782   }
3783 
3784   /// An indicator whether this region is to generate multiple replicated
3785   /// instances of output IR corresponding to its VPBlockBases.
3786   bool isReplicator() const { return IsReplicator; }
3787 
3788   /// The method which generates the output IR instructions that correspond to
3789   /// this VPRegionBlock, thereby "executing" the VPlan.
3790   void execute(VPTransformState *State) override;
3791 
3792   // Return the cost of this region.
3793   InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
3794 
3795 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3796   /// Print this VPRegionBlock to \p O (recursively), prefixing all lines with
3797   /// \p Indent. \p SlotTracker is used to print unnamed VPValue's using
3798   /// consecutive numbers.
3799   ///
3800   /// Note that the numbering is applied to the whole VPlan, so printing
3801   /// individual regions is consistent with the whole VPlan printing.
3802   void print(raw_ostream &O, const Twine &Indent,
3803              VPSlotTracker &SlotTracker) const override;
3804   using VPBlockBase::print; // Get the print(raw_stream &O) version.
3805 #endif
3806 
3807   /// Clone all blocks in the single-entry single-exit region of the block and
3808   /// their recipes without updating the operands of the cloned recipes.
3809   VPRegionBlock *clone() override;
3810 };
3811 
3812 /// VPlan models a candidate for vectorization, encoding various decisions taken
3813 /// to produce efficient output IR, including which branches, basic-blocks and
3814 /// output IR instructions to generate, and their cost. VPlan holds a
3815 /// Hierarchical-CFG of VPBasicBlocks and VPRegionBlocks rooted at an Entry
3816 /// VPBasicBlock.
3817 class VPlan {
3818   friend class VPlanPrinter;
3819   friend class VPSlotTracker;
3820 
3821   /// VPBasicBlock corresponding to the original preheader. Used to place
3822   /// VPExpandSCEV recipes for expressions used during skeleton creation and the
3823   /// rest of VPlan execution.
3824   /// When this VPlan is used for the epilogue vector loop, the entry will be
3825   /// replaced by a new entry block created during skeleton creation.
3826   VPBasicBlock *Entry;
3827 
3828   /// VPIRBasicBlock wrapping the header of the original scalar loop.
3829   VPIRBasicBlock *ScalarHeader;
3830 
3831   /// Holds the VFs applicable to this VPlan.
3832   SmallSetVector<ElementCount, 2> VFs;
3833 
3834   /// Holds the UFs applicable to this VPlan. If empty, the VPlan is valid for
3835   /// any UF.
3836   SmallSetVector<unsigned, 2> UFs;
3837 
3838   /// Holds the name of the VPlan, for printing.
3839   std::string Name;
3840 
3841   /// Represents the trip count of the original loop, for folding
3842   /// the tail.
3843   VPValue *TripCount = nullptr;
3844 
3845   /// Represents the backedge taken count of the original loop, for folding
3846   /// the tail. It equals TripCount - 1.
3847   VPValue *BackedgeTakenCount = nullptr;
3848 
3849   /// Represents the vector trip count.
3850   VPValue VectorTripCount;
3851 
3852   /// Represents the vectorization factor of the loop.
3853   VPValue VF;
3854 
3855   /// Represents the loop-invariant VF * UF of the vector loop region.
3856   VPValue VFxUF;
3857 
3858   /// Holds a mapping between Values and their corresponding VPValue inside
3859   /// VPlan.
3860   Value2VPValueTy Value2VPValue;
3861 
3862   /// Contains all the external definitions created for this VPlan. External
3863   /// definitions are VPValues that hold a pointer to their underlying IR.
3864   SmallVector<VPValue *, 16> VPLiveInsToFree;
3865 
3866   /// Mapping from SCEVs to the VPValues representing their expansions.
3867   /// NOTE: This mapping is temporary and will be removed once all users have
3868   /// been modeled in VPlan directly.
3869   DenseMap<const SCEV *, VPValue *> SCEVToExpansion;
3870 
3871   /// Blocks allocated and owned by the VPlan. They will be deleted once the
3872   /// VPlan is destroyed.
3873   SmallVector<VPBlockBase *> CreatedBlocks;
3874 
3875   /// Construct a VPlan with \p Entry as its entry block and with
3876   /// \p ScalarHeader wrapping the original header of the scalar loop.
3877   VPlan(VPBasicBlock *Entry, VPIRBasicBlock *ScalarHeader)
3878       : Entry(Entry), ScalarHeader(ScalarHeader) {
3879     Entry->setPlan(this);
3880     assert(ScalarHeader->getNumSuccessors() == 0 &&
3881            "scalar header must be a leaf node");
3882   }
3883 
3884 public:
3885   /// Construct a VPlan for \p L. This will create VPIRBasicBlocks wrapping the
3886   /// original preheader and scalar header of \p L, to be used as entry and
3887   /// scalar header blocks of the new VPlan.
3888   VPlan(Loop *L);
3889 
3890   /// Construct a VPlan with a new VPBasicBlock as entry, a VPIRBasicBlock
3891   /// wrapping \p ScalarHeaderBB and a trip count of \p TC.
3892   VPlan(BasicBlock *ScalarHeaderBB, VPValue *TC) {
3893     setEntry(createVPBasicBlock("preheader"));
3894     ScalarHeader = createVPIRBasicBlock(ScalarHeaderBB);
3895     TripCount = TC;
3896   }
3897 
3898   ~VPlan();
3899 
3900   void setEntry(VPBasicBlock *VPBB) {
3901     Entry = VPBB;
3902     VPBB->setPlan(this);
3903   }
3904 
3905   /// Create initial VPlan, having an "entry" VPBasicBlock (wrapping
3906   /// original scalar pre-header) which contains SCEV expansions that need
3907   /// to happen before the CFG is modified (when executing a VPlan for the
3908   /// epilogue vector loop, the original entry needs to be replaced by a new
3909   /// one); a VPBasicBlock for the vector pre-header, followed by a region for
3910   /// the vector loop, followed by the middle VPBasicBlock. If a check is needed
3911   /// to guard executing the scalar epilogue loop, it will be added to the
3912   /// middle block, together with VPBasicBlocks for the scalar preheader and
3913   /// exit blocks. \p InductionTy is the type of the canonical induction and
3914   /// used for related values, like the trip count expression.
3915   static VPlanPtr createInitialVPlan(Type *InductionTy,
3916                                      PredicatedScalarEvolution &PSE,
3917                                      bool RequiresScalarEpilogueCheck,
3918                                      bool TailFolded, Loop *TheLoop);
3919 
3920   /// Prepare the plan for execution, setting up the required live-in values.
3921   void prepareToExecute(Value *TripCount, Value *VectorTripCount,
3922                         VPTransformState &State);
3923 
3924   /// Generate the IR code for this VPlan.
3925   void execute(VPTransformState *State);
3926 
3927   /// Return the cost of this plan.
3928   InstructionCost cost(ElementCount VF, VPCostContext &Ctx);
3929 
3930   VPBasicBlock *getEntry() { return Entry; }
3931   const VPBasicBlock *getEntry() const { return Entry; }
3932 
3933   /// Returns the preheader of the vector loop region, if one exists, or null
3934   /// otherwise.
3935   VPBasicBlock *getVectorPreheader() {
3936     VPRegionBlock *VectorRegion = getVectorLoopRegion();
3937     return VectorRegion
3938                ? cast<VPBasicBlock>(VectorRegion->getSinglePredecessor())
3939                : nullptr;
3940   }
3941 
3942   /// Returns the VPRegionBlock of the vector loop.
3943   VPRegionBlock *getVectorLoopRegion();
3944   const VPRegionBlock *getVectorLoopRegion() const;
3945 
3946   /// Returns the 'middle' block of the plan, that is the block that selects
3947   /// whether to execute the scalar tail loop or the exit block from the loop
3948   /// latch.
3949   const VPBasicBlock *getMiddleBlock() const {
3950     return cast<VPBasicBlock>(getScalarPreheader()->getPredecessors().front());
3951   }
3952   VPBasicBlock *getMiddleBlock() {
3953     return cast<VPBasicBlock>(getScalarPreheader()->getPredecessors().front());
3954   }
3955 
3956   /// Return the VPBasicBlock for the preheader of the scalar loop.
3957   VPBasicBlock *getScalarPreheader() const {
3958     return cast<VPBasicBlock>(getScalarHeader()->getSinglePredecessor());
3959   }
3960 
3961   /// Return the VPIRBasicBlock wrapping the header of the scalar loop.
3962   VPIRBasicBlock *getScalarHeader() const { return ScalarHeader; }
3963 
3964   /// Return an iterator range over the VPIRBasicBlocks wrapping the exit blocks
3965   /// of the VPlan, that is, leaf nodes except the scalar header. Defined in
3966   /// VPlanHCFG, as the definition of the type needs access to the definitions
3967   /// of VPBlockShallowTraversalWrapper.
3968   auto getExitBlocks();
3969 
3970   /// The trip count of the original loop.
3971   VPValue *getTripCount() const {
3972     assert(TripCount && "trip count needs to be set before accessing it");
3973     return TripCount;
3974   }
3975 
3976   /// Resets the trip count for the VPlan. The caller must make sure all uses of
3977   /// the original trip count have been replaced.
3978   void resetTripCount(VPValue *NewTripCount) {
3979     assert(TripCount && NewTripCount && TripCount->getNumUsers() == 0 &&
3980            "TripCount always must be set");
3981     TripCount = NewTripCount;
3982   }
3983 
3984   /// The backedge taken count of the original loop.
3985   VPValue *getOrCreateBackedgeTakenCount() {
3986     if (!BackedgeTakenCount)
3987       BackedgeTakenCount = new VPValue();
3988     return BackedgeTakenCount;
3989   }
3990 
3991   /// The vector trip count.
3992   VPValue &getVectorTripCount() { return VectorTripCount; }
3993 
3994   /// Returns the VF of the vector loop region.
3995   VPValue &getVF() { return VF; };
3996 
3997   /// Returns VF * UF of the vector loop region.
3998   VPValue &getVFxUF() { return VFxUF; }
3999 
4000   void addVF(ElementCount VF) { VFs.insert(VF); }
4001 
4002   void setVF(ElementCount VF) {
4003     assert(hasVF(VF) && "Cannot set VF not already in plan");
4004     VFs.clear();
4005     VFs.insert(VF);
4006   }
4007 
4008   bool hasVF(ElementCount VF) { return VFs.count(VF); }
4009   bool hasScalableVF() {
4010     return any_of(VFs, [](ElementCount VF) { return VF.isScalable(); });
4011   }
4012 
4013   /// Returns an iterator range over all VFs of the plan.
4014   iterator_range<SmallSetVector<ElementCount, 2>::iterator>
4015   vectorFactors() const {
4016     return {VFs.begin(), VFs.end()};
4017   }
4018 
4019   bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }
4020 
4021   bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }
4022 
4023   unsigned getUF() const {
4024     assert(UFs.size() == 1 && "Expected a single UF");
4025     return UFs[0];
4026   }
4027 
4028   void setUF(unsigned UF) {
4029     assert(hasUF(UF) && "Cannot set the UF not already in plan");
4030     UFs.clear();
4031     UFs.insert(UF);
4032   }
4033 
4034   /// Return a string with the name of the plan and the applicable VFs and UFs.
4035   std::string getName() const;
4036 
4037   void setName(const Twine &newName) { Name = newName.str(); }
4038 
4039   /// Gets the live-in VPValue for \p V or adds a new live-in (if none exists
4040   /// yet) for \p V.
4041   VPValue *getOrAddLiveIn(Value *V) {
4042     assert(V && "Trying to get or add the VPValue of a null Value");
4043     if (!Value2VPValue.count(V)) {
4044       VPValue *VPV = new VPValue(V);
4045       VPLiveInsToFree.push_back(VPV);
4046       assert(VPV->isLiveIn() && "VPV must be a live-in.");
4047       assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
4048       Value2VPValue[V] = VPV;
4049     }
4050 
4051     assert(Value2VPValue.count(V) && "Value does not exist in VPlan");
4052     assert(Value2VPValue[V]->isLiveIn() &&
4053            "Only live-ins should be in mapping");
4054     return Value2VPValue[V];
4055   }
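
  // A sketch of the live-in mapping in use; Plan is a hypothetical VPlan &
  // and IRV an IR Value *. The first call creates the VPValue; later calls,
  // as well as getLiveIn below, return the same one:
  //
  //   VPValue *LiveIn = Plan.getOrAddLiveIn(IRV);
  //   assert(Plan.getLiveIn(IRV) == LiveIn && "same live-in expected");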
4056 
4057   /// Return the live-in VPValue for \p V, if there is one or nullptr otherwise.
4058   VPValue *getLiveIn(Value *V) const { return Value2VPValue.lookup(V); }
4059 
4060 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4061   /// Print the live-ins of this VPlan to \p O.
4062   void printLiveIns(raw_ostream &O) const;
4063 
4064   /// Print this VPlan to \p O.
4065   void print(raw_ostream &O) const;
4066 
4067   /// Print this VPlan in DOT format to \p O.
4068   void printDOT(raw_ostream &O) const;
4069 
4070   /// Dump the plan to stderr (for debugging).
4071   LLVM_DUMP_METHOD void dump() const;
4072 #endif
4073 
4074   /// Returns the canonical induction recipe of the vector loop.
4075   VPCanonicalIVPHIRecipe *getCanonicalIV() {
4076     VPBasicBlock *EntryVPBB = getVectorLoopRegion()->getEntryBasicBlock();
4077     if (EntryVPBB->empty()) {
4078       // VPlan native path.
4079       EntryVPBB = cast<VPBasicBlock>(EntryVPBB->getSingleSuccessor());
4080     }
4081     return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin());
4082   }
4083 
4084   VPValue *getSCEVExpansion(const SCEV *S) const {
4085     return SCEVToExpansion.lookup(S);
4086   }
4087 
4088   void addSCEVExpansion(const SCEV *S, VPValue *V) {
4089     assert(!SCEVToExpansion.contains(S) && "SCEV already expanded");
4090     SCEVToExpansion[S] = V;
4091   }
4092 
4093   /// Clone the current VPlan, update all VPValues of the new VPlan and cloned
4094   /// recipes to refer to the clones, and return it.
4095   VPlan *duplicate();
4096 
4097   /// Create a new VPBasicBlock with \p Name and containing \p Recipe if
4098   /// present. The returned block is owned by the VPlan and deleted once the
4099   /// VPlan is destroyed.
4100   VPBasicBlock *createVPBasicBlock(const Twine &Name,
4101                                    VPRecipeBase *Recipe = nullptr) {
4102     auto *VPB = new VPBasicBlock(Name, Recipe);
4103     CreatedBlocks.push_back(VPB);
4104     return VPB;
4105   }
4106 
4107   /// Create a new VPRegionBlock with \p Entry, \p Exiting and \p Name. If \p
4108   /// IsReplicator is true, the region is a replicate region. The returned block
4109   /// is owned by the VPlan and deleted once the VPlan is destroyed.
4110   VPRegionBlock *createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
4111                                      const std::string &Name = "",
4112                                      bool IsReplicator = false) {
4113     auto *VPB = new VPRegionBlock(Entry, Exiting, Name, IsReplicator);
4114     CreatedBlocks.push_back(VPB);
4115     return VPB;
4116   }
4117 
4118   /// Create a new VPRegionBlock with \p Name and entry and exiting blocks set
4119   /// to nullptr. If \p IsReplicator is true, the region is a replicate region.
4120   /// The returned block is owned by the VPlan and deleted once the VPlan is
4121   /// destroyed.
4122   VPRegionBlock *createVPRegionBlock(const std::string &Name = "",
4123                                      bool IsReplicator = false) {
4124     auto *VPB = new VPRegionBlock(Name, IsReplicator);
4125     CreatedBlocks.push_back(VPB);
4126     return VPB;
4127   }
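  // A construction sketch (illustrative; assumes "Plan" is a VPlan being
  // built and that a helper such as VPBlockUtils::connectBlocks is available
  // to wire up the CFG). All blocks created below are owned and eventually
  // freed by the plan, so no manual cleanup is needed:
  //   VPBasicBlock *Header = Plan.createVPBasicBlock("loop.header");
  //   VPBasicBlock *Latch = Plan.createVPBasicBlock("loop.latch");
  //   VPBlockUtils::connectBlocks(Header, Latch);
  //   VPRegionBlock *LoopRegion =
  //       Plan.createVPRegionBlock(Header, Latch, "loop");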
4128 
4129   /// Create a VPIRBasicBlock wrapping \p IRBB, but do not create
4130   /// VPIRInstructions wrapping the instructions in \p IRBB. The returned
4131   /// block is owned by the VPlan and deleted once the VPlan is destroyed.
4132   VPIRBasicBlock *createEmptyVPIRBasicBlock(BasicBlock *IRBB);
4133 
4134   /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all
4135   /// instructions in \p IRBB, except its terminator which is managed by the
4136   /// successors of the block in VPlan. The returned block is owned by the VPlan
4137   /// and deleted once the VPlan is destroyed.
4138   VPIRBasicBlock *createVPIRBasicBlock(BasicBlock *IRBB);
4139 };
4140 
4141 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4142 /// VPlanPrinter prints a given VPlan to a given output stream. The printing is
4143 /// indented and follows the dot format.
4144 class VPlanPrinter {
4145   raw_ostream &OS;
4146   const VPlan &Plan;
4147   unsigned Depth = 0;
4148   unsigned TabWidth = 2;
4149   std::string Indent;
4150   unsigned BID = 0;
4151   SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
4152 
4153   VPSlotTracker SlotTracker;
4154 
4155   /// Handle indentation.
4156   void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
4157 
4158   /// Print a given \p Block of the Plan.
4159   void dumpBlock(const VPBlockBase *Block);
4160 
4161   /// Print the information related to the CFG edges going out of a given
4162   /// \p Block, followed by printing the successor blocks themselves.
4163   void dumpEdges(const VPBlockBase *Block);
4164 
4165   /// Print a given \p BasicBlock, including its VPRecipes, followed by printing
4166   /// its successor blocks.
4167   void dumpBasicBlock(const VPBasicBlock *BasicBlock);
4168 
4169   /// Print a given \p Region of the Plan.
4170   void dumpRegion(const VPRegionBlock *Region);
4171 
4172   unsigned getOrCreateBID(const VPBlockBase *Block) {
4173     return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
4174   }
4175 
4176   Twine getOrCreateName(const VPBlockBase *Block);
4177 
4178   Twine getUID(const VPBlockBase *Block);
4179 
4180   /// Print the information related to a CFG edge between two VPBlockBases.
4181   void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
4182                 const Twine &Label);
4183 
4184 public:
4185   VPlanPrinter(raw_ostream &O, const VPlan &P)
4186       : OS(O), Plan(P), SlotTracker(&P) {}
4187 
4188   LLVM_DUMP_METHOD void dump();
4189 };
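// A usage sketch (illustrative; "Plan" is a hypothetical VPlan): emit the
// plan as a dot graph to stderr, e.g. to render it with Graphviz:
//   VPlanPrinter Printer(errs(), Plan);
//   Printer.dump();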
4190 
4191 struct VPlanIngredient {
4192   const Value *V;
4193 
4194   VPlanIngredient(const Value *V) : V(V) {}
4195 
4196   void print(raw_ostream &O) const;
4197 };
4198 
4199 inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) {
4200   I.print(OS);
4201   return OS;
4202 }
4203 
4204 inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) {
4205   Plan.print(OS);
4206   return OS;
4207 }
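// With the two operators above, ingredients and whole plans compose with any
// raw_ostream (illustrative; "IRV" and "Plan" are hypothetical):
//   errs() << VPlanIngredient(IRV) << '\n' << Plan;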
4208 #endif
4209 
4210 class VPInterleavedAccessInfo {
4211   DenseMap<VPInstruction *, InterleaveGroup<VPInstruction> *>
4212       InterleaveGroupMap;
4213 
4214   /// Type for mapping of instruction-based interleave groups to VPInstruction
4215   /// interleave groups.
4216   using Old2NewTy = DenseMap<InterleaveGroup<Instruction> *,
4217                              InterleaveGroup<VPInstruction> *>;
4218 
4219   /// Recursively traverse \p Region and populate VPlan based interleave groups
4220   /// based on \p IAI.
4221   void visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New,
4222                    InterleavedAccessInfo &IAI);
4223   /// Recursively traverse \p Block and populate VPlan based interleave groups
4224   /// based on \p IAI.
4225   void visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
4226                   InterleavedAccessInfo &IAI);
4227 
4228 public:
4229   VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI);
4230 
4231   ~VPInterleavedAccessInfo() {
4232     SmallPtrSet<InterleaveGroup<VPInstruction> *, 4> DelSet;
4233     // Avoid releasing a pointer twice.
4234     for (auto &I : InterleaveGroupMap)
4235       DelSet.insert(I.second);
4236     for (auto *Ptr : DelSet)
4237       delete Ptr;
4238   }
4239 
4240   /// Get the interleave group that \p Instr belongs to.
4241   ///
4242   /// \returns nullptr if \p Instr does not belong to such a group.
4243   InterleaveGroup<VPInstruction> *
4244   getInterleaveGroup(VPInstruction *Instr) const {
4245     return InterleaveGroupMap.lookup(Instr);
4246   }
4247 };
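// A query sketch (illustrative; "Plan", "IAI" and "VPI" are hypothetical):
//   VPInterleavedAccessInfo VPIAI(Plan, IAI);
//   if (auto *IG = VPIAI.getInterleaveGroup(VPI))
//     dbgs() << "VPI belongs to a group of factor " << IG->getFactor() << '\n';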
4248 
4249 /// Class that maps (parts of) an existing VPlan to trees of combined
4250 /// VPInstructions.
4251 class VPlanSlp {
4252   enum class OpMode { Failed, Load, Opcode };
4253 
4254   /// A DenseMapInfo implementation for using SmallVector<VPValue *, 4> as
4255   /// DenseMap keys.
4256   struct BundleDenseMapInfo {
4257     static SmallVector<VPValue *, 4> getEmptyKey() {
4258       return {reinterpret_cast<VPValue *>(-1)};
4259     }
4260 
4261     static SmallVector<VPValue *, 4> getTombstoneKey() {
4262       return {reinterpret_cast<VPValue *>(-2)};
4263     }
4264 
4265     static unsigned getHashValue(const SmallVector<VPValue *, 4> &V) {
4266       return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
4267     }
4268 
4269     static bool isEqual(const SmallVector<VPValue *, 4> &LHS,
4270                         const SmallVector<VPValue *, 4> &RHS) {
4271       return LHS == RHS;
4272     }
4273   };
4274 
4275   /// Mapping of values in the original VPlan to a combined VPInstruction.
4276   DenseMap<SmallVector<VPValue *, 4>, VPInstruction *, BundleDenseMapInfo>
4277       BundleToCombined;
4278 
4279   VPInterleavedAccessInfo &IAI;
4280 
4281   /// Basic block to operate on. For now, only instructions in a single BB are
4282   /// considered.
4283   const VPBasicBlock &BB;
4284 
4285   /// Indicates whether we managed to combine all visited instructions or not.
4286   bool CompletelySLP = true;
4287 
4288   /// Width of the widest combined bundle in bits.
4289   unsigned WidestBundleBits = 0;
4290 
4291   using MultiNodeOpTy =
4292       typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>;
4293 
4294   /// Input operand bundles for the current multi node. Each multi node
4295   /// operand bundle contains values not matching the multi node's opcode.
4296   /// They will be reordered in reorderMultiNodeOps once we have completed
4297   /// building a multi node.
4298   SmallVector<MultiNodeOpTy, 4> MultiNodeOps;
4299 
4300   /// Indicates whether we are currently building a multi node.
4301   bool MultiNodeActive = false;
4302 
4303   /// Check if we can vectorize \p Operands together.
4304   bool areVectorizable(ArrayRef<VPValue *> Operands) const;
4305 
4306   /// Add combined instruction \p New for the bundle \p Operands.
4307   void addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New);
4308 
4309   /// Indicate we hit a bundle we failed to combine. Returns nullptr for now.
4310   VPInstruction *markFailed();
4311 
4312   /// Reorder operands in the multi node to maximize sequential memory access
4313   /// and commutative operations.
4314   SmallVector<MultiNodeOpTy, 4> reorderMultiNodeOps();
4315 
4316   /// Choose the best candidate to use for the lane after \p Last. The set of
4317   /// candidates to choose from are values with an opcode matching \p Last's
4318   /// or loads consecutive to \p Last.
4319   std::pair<OpMode, VPValue *> getBest(OpMode Mode, VPValue *Last,
4320                                        SmallPtrSetImpl<VPValue *> &Candidates,
4321                                        VPInterleavedAccessInfo &IAI);
4322 
4323 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4324   /// Print bundle \p Values to dbgs().
4325   void dumpBundle(ArrayRef<VPValue *> Values);
4326 #endif
4327 
4328 public:
4329   VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {}
4330 
4331   ~VPlanSlp() = default;
4332 
4333   /// Tries to build an SLP tree rooted at \p Operands and returns a
4334   /// VPInstruction combining \p Operands, if they can be combined.
4335   VPInstruction *buildGraph(ArrayRef<VPValue *> Operands);
4336 
4337   /// Return the width of the widest combined bundle in bits.
4338   unsigned getWidestBundleBits() const { return WidestBundleBits; }
4339 
4340   /// Return true if all visited instructions can be combined.
4341   bool isCompletelySLP() const { return CompletelySLP; }
4342 };
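// A driver sketch (illustrative; "VPIAI", "VPBB" and "StoreBundle" are
// hypothetical; StoreBundle would hold the VPValues of adjacent stores):
//   VPlanSlp SlpAnalysis(VPIAI, *VPBB);
//   if (VPInstruction *Root = SlpAnalysis.buildGraph(StoreBundle))
//     dbgs() << "Widest combined bundle: "
//            << SlpAnalysis.getWidestBundleBits() << " bits\n";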
4343 } // end namespace llvm
4344 
4345 #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
4346