//===- VPlan.h - Represent A Vectorizer Plan --------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains the declarations of the Vectorization Plan base classes:
/// 1. VPBasicBlock and VPRegionBlock that inherit from a common pure virtual
///    VPBlockBase, together implementing a Hierarchical CFG;
/// 2. Pure virtual VPRecipeBase serving as the base class for recipes contained
///    within VPBasicBlocks;
/// 3. Pure virtual VPSingleDefRecipe serving as a base class for recipes that
///    also inherit from VPValue;
/// 4. VPInstruction, a concrete Recipe and VPUser modeling a single planned
///    instruction;
/// 5. The VPlan class holding a candidate for vectorization;
/// 6. The VPlanPrinter class providing a way to print a plan in dot format;
/// These are documented in docs/VectorizationPlan.rst.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H

#include "VPlanAnalysis.h"
#include "VPlanValue.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/ilist.h"
#include "llvm/ADT/ilist_node.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/FMF.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/InstructionCost.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <string>

namespace llvm {

class BasicBlock;
class DominatorTree;
class InnerLoopVectorizer;
class IRBuilderBase;
class LoopInfo;
class raw_ostream;
class RecurrenceDescriptor;
class SCEV;
class Type;
class VPBasicBlock;
class VPBuilder;
class VPRegionBlock;
class VPlan;
class VPReplicateRecipe;
class VPlanSlp;
class Value;
class LoopVectorizationCostModel;
class LoopVersioning;

struct VPCostContext;

namespace Intrinsic {
typedef unsigned ID;
}

/// Returns a calculation for the total number of elements for a given \p VF.
/// For fixed width vectors this value is a constant, whereas for scalable
/// vectors it is an expression determined at runtime.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step);

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
inline unsigned getReciprocalPredBlockProb() { return 2; }

/// A range of powers-of-2 vectorization factors with fixed start and
/// adjustable end. The range includes start and excludes end, e.g.,:
/// [1, 16) = {1, 2, 4, 8}
struct VFRange {
  // A power of 2.
  const ElementCount Start;

  // A power of 2. If End <= Start, the range is empty.
  ElementCount End;

  bool isEmpty() const {
    return End.getKnownMinValue() <= Start.getKnownMinValue();
  }

  VFRange(const ElementCount &Start, const ElementCount &End)
      : Start(Start), End(End) {
    assert(Start.isScalable() == End.isScalable() &&
           "Both Start and End should have the same scalable flag");
    assert(isPowerOf2_32(Start.getKnownMinValue()) &&
           "Expected Start to be a power of 2");
    assert(isPowerOf2_32(End.getKnownMinValue()) &&
           "Expected End to be a power of 2");
  }

  /// Iterator to iterate over vectorization factors in a VFRange.
  class iterator
      : public iterator_facade_base<iterator, std::forward_iterator_tag,
                                    ElementCount> {
    ElementCount VF;

  public:
    iterator(ElementCount VF) : VF(VF) {}

    bool operator==(const iterator &Other) const { return VF == Other.VF; }

    ElementCount operator*() const { return VF; }

    iterator &operator++() {
      VF *= 2;
      return *this;
    }
  };

  iterator begin() { return iterator(Start); }
  iterator end() {
    assert(isPowerOf2_32(End.getKnownMinValue()));
    return iterator(End);
  }
};
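
// A short usage sketch (illustrative, not part of the upstream header): a
// VFRange over fixed-width factors visits exactly the powers of 2 in
// [Start, End), e.g.:
//
//   VFRange Range(ElementCount::getFixed(1), ElementCount::getFixed(16));
//   for (ElementCount VF : Range)
//     ; // Visits VF = 1, 2, 4 and 8.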

using VPlanPtr = std::unique_ptr<VPlan>;

/// In what follows, the term "input IR" refers to code that is fed into the
/// vectorizer whereas the term "output IR" refers to code that is generated by
/// the vectorizer.

/// VPLane provides a way to access lanes in both fixed width and scalable
/// vectors, where for the latter the lane index sometimes needs calculating
/// as a runtime expression.
class VPLane {
public:
  /// Kind describes how to interpret Lane.
  enum class Kind : uint8_t {
    /// For First, Lane is the index into the first N elements of a
    /// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>.
    First,
    /// For ScalableLast, Lane is the offset from the start of the last
    /// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For
    /// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of
    /// 1 corresponds to `((vscale - 1) * N) + 1`, etc.
    ScalableLast
  };

private:
  /// in [0..VF)
  unsigned Lane;

  /// Indicates how the Lane should be interpreted, as described above.
  Kind LaneKind;

public:
  VPLane(unsigned Lane) : Lane(Lane), LaneKind(VPLane::Kind::First) {}
  VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {}

  static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); }

  static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset) {
    assert(Offset > 0 && Offset <= VF.getKnownMinValue() &&
           "trying to extract with invalid offset");
    unsigned LaneOffset = VF.getKnownMinValue() - Offset;
    Kind LaneKind;
    if (VF.isScalable())
      // In this case 'LaneOffset' refers to the offset from the start of the
      // last subvector with VF.getKnownMinValue() elements.
      LaneKind = VPLane::Kind::ScalableLast;
    else
      LaneKind = VPLane::Kind::First;
    return VPLane(LaneOffset, LaneKind);
  }

  static VPLane getLastLaneForVF(const ElementCount &VF) {
    return getLaneFromEnd(VF, 1);
  }

  /// Returns a compile-time known value for the lane index and asserts if the
  /// lane can only be calculated at runtime.
  unsigned getKnownLane() const {
    assert(LaneKind == Kind::First);
    return Lane;
  }

  /// Returns an expression describing the lane index that can be used at
  /// runtime.
  Value *getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const;

  /// Returns the Kind of lane offset.
  Kind getKind() const { return LaneKind; }

  /// Returns true if this is the first lane of the whole vector.
  bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; }

  /// Maps the lane to a cache index based on \p VF.
  unsigned mapToCacheIndex(const ElementCount &VF) const {
    switch (LaneKind) {
    case VPLane::Kind::ScalableLast:
      assert(VF.isScalable() && Lane < VF.getKnownMinValue());
      return VF.getKnownMinValue() + Lane;
    default:
      assert(Lane < VF.getKnownMinValue());
      return Lane;
    }
  }
};
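
// Illustrative sketch of the ScalableLast encoding above: for a scalable VF
// <vscale x 4 x i32>, the last lane is offset 3 into the last 4-element
// subvector, i.e. lane (vscale - 1) * 4 + 3 at runtime:
//
//   VPLane L = VPLane::getLastLaneForVF(ElementCount::getScalable(4));
//   // L.getKind() == VPLane::Kind::ScalableLast, and mapToCacheIndex()
//   // returns 4 + 3 = 7, past the indices reserved for Kind::First lanes.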

/// VPTransformState holds information passed down when "executing" a VPlan,
/// needed for generating the output IR.
struct VPTransformState {
  VPTransformState(const TargetTransformInfo *TTI, ElementCount VF, unsigned UF,
                   LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder,
                   InnerLoopVectorizer *ILV, VPlan *Plan,
                   Loop *CurrentParentLoop, Type *CanonicalIVTy);
  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// The chosen Vectorization Factor of the loop being vectorized.
  ElementCount VF;

  /// Hold the index to generate specific scalar instructions. Null indicates
  /// that all instances are to be generated, using either scalar or vector
  /// instructions.
  std::optional<VPLane> Lane;

  struct DataState {
    // Each value from the original loop, when vectorized, is represented by a
    // vector value in the map.
    DenseMap<VPValue *, Value *> VPV2Vector;

    DenseMap<VPValue *, SmallVector<Value *, 4>> VPV2Scalars;
  } Data;

  /// Get the generated vector Value for a given VPValue \p Def if \p IsScalar
  /// is false, otherwise return the generated scalar. \see set.
  Value *get(VPValue *Def, bool IsScalar = false);

  /// Get the generated Value for a given VPValue and a given \p Lane.
  Value *get(VPValue *Def, const VPLane &Lane);

  bool hasVectorValue(VPValue *Def) { return Data.VPV2Vector.contains(Def); }

  bool hasScalarValue(VPValue *Def, VPLane Lane) {
    auto I = Data.VPV2Scalars.find(Def);
    if (I == Data.VPV2Scalars.end())
      return false;
    unsigned CacheIdx = Lane.mapToCacheIndex(VF);
    return CacheIdx < I->second.size() && I->second[CacheIdx];
  }

  /// Set the generated vector Value for a given VPValue, if \p
  /// IsScalar is false. If \p IsScalar is true, set the scalar in lane 0.
  void set(VPValue *Def, Value *V, bool IsScalar = false) {
    if (IsScalar) {
      set(Def, V, VPLane(0));
      return;
    }
    assert((VF.isScalar() || V->getType()->isVectorTy()) &&
           "scalar values must be stored as (0, 0)");
    Data.VPV2Vector[Def] = V;
  }

  /// Reset an existing vector value for \p Def.
  void reset(VPValue *Def, Value *V) {
    assert(Data.VPV2Vector.contains(Def) && "need to overwrite existing value");
    Data.VPV2Vector[Def] = V;
  }

  /// Set the generated scalar \p V for \p Def and the given \p Lane.
  void set(VPValue *Def, Value *V, const VPLane &Lane) {
    auto &Scalars = Data.VPV2Scalars[Def];
    unsigned CacheIdx = Lane.mapToCacheIndex(VF);
    if (Scalars.size() <= CacheIdx)
      Scalars.resize(CacheIdx + 1);
    assert(!Scalars[CacheIdx] && "must not overwrite existing value");
    Scalars[CacheIdx] = V;
  }

  /// Reset an existing scalar value for \p Def and a given \p Lane.
  void reset(VPValue *Def, Value *V, const VPLane &Lane) {
    auto Iter = Data.VPV2Scalars.find(Def);
    assert(Iter != Data.VPV2Scalars.end() &&
           "need to overwrite existing value");
    unsigned CacheIdx = Lane.mapToCacheIndex(VF);
    assert(CacheIdx < Iter->second.size() &&
           "need to overwrite existing value");
    Iter->second[CacheIdx] = V;
  }
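
  // Illustrative sketch of the caching protocol above, assuming Def is a
  // VPValue and V0/V1 are scalar Values generated for lane 0 (hypothetical
  // names):
  //
  //   State.set(Def, V0, VPLane(0));            // cache a per-lane scalar
  //   if (State.hasScalarValue(Def, VPLane(0)))
  //     Value *V = State.get(Def, VPLane(0));   // returns V0
  //   State.reset(Def, V1, VPLane(0));          // overwrite an existing entry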

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Value *To, Instruction *From);

  /// Set the debug location in the builder using the debug location \p DL.
  void setDebugLocFrom(DebugLoc DL);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPLane &Lane);

  /// Hold state information used when constructing the CFG of the output IR,
  /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
  struct CFGState {
    /// The previous VPBasicBlock visited. Initially set to null.
    VPBasicBlock *PrevVPBB = nullptr;

    /// The previous IR BasicBlock created or used. Initially set to the new
    /// header BasicBlock.
    BasicBlock *PrevBB = nullptr;

    /// The last IR BasicBlock in the output IR. Set to the exit block of the
    /// vector loop.
    BasicBlock *ExitBB = nullptr;

    /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
    /// of replication, maps the BasicBlock of the last replica created.
    SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;

    /// Updater for the DominatorTree.
    DomTreeUpdater DTU;

    CFGState(DominatorTree *DT)
        : DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy) {}

    /// Returns the BasicBlock* mapped to the pre-header of the loop region
    /// containing \p R.
    BasicBlock *getPreheaderBBFor(VPRecipeBase *R);
  } CFG;

  /// Hold a pointer to LoopInfo to register new basic blocks in the loop.
  LoopInfo *LI;

  /// Hold a reference to the IRBuilder used to generate output IR code.
  IRBuilderBase &Builder;

  /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
  InnerLoopVectorizer *ILV;

  /// Pointer to the VPlan code is generated for.
  VPlan *Plan;

  /// The parent loop object for the current scope, or nullptr.
  Loop *CurrentParentLoop = nullptr;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  LoopVersioning *LVer = nullptr;

  /// Map SCEVs to their expanded values. Populated when executing
  /// VPExpandSCEVRecipes.
  DenseMap<const SCEV *, Value *> ExpandedSCEVs;

  /// VPlan-based type analysis.
  VPTypeAnalysis TypeAnalysis;
};

/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
/// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock.
class VPBlockBase {
  friend class VPBlockUtils;

  const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).

  /// An optional name for the block.
  std::string Name;

  /// The immediate VPRegionBlock which this VPBlockBase belongs to, or null if
  /// it is a topmost VPBlockBase.
  VPRegionBlock *Parent = nullptr;

  /// List of predecessor blocks.
  SmallVector<VPBlockBase *, 1> Predecessors;

  /// List of successor blocks.
  SmallVector<VPBlockBase *, 1> Successors;

  /// VPlan containing the block. Can only be set on the entry block of the
  /// plan.
  VPlan *Plan = nullptr;

  /// Add \p Successor as the last successor to this block.
  void appendSuccessor(VPBlockBase *Successor) {
    assert(Successor && "Cannot add nullptr successor!");
    Successors.push_back(Successor);
  }

  /// Add \p Predecessor as the last predecessor to this block.
  void appendPredecessor(VPBlockBase *Predecessor) {
    assert(Predecessor && "Cannot add nullptr predecessor!");
    Predecessors.push_back(Predecessor);
  }

  /// Remove \p Predecessor from the predecessors of this block.
  void removePredecessor(VPBlockBase *Predecessor) {
    auto Pos = find(Predecessors, Predecessor);
    assert(Pos != Predecessors.end() && "Predecessor does not exist");
    Predecessors.erase(Pos);
  }

  /// Remove \p Successor from the successors of this block.
  void removeSuccessor(VPBlockBase *Successor) {
    auto Pos = find(Successors, Successor);
    assert(Pos != Successors.end() && "Successor does not exist");
    Successors.erase(Pos);
  }

  /// This function replaces one predecessor with another, useful when
  /// trying to replace an old block in the CFG with a new one.
  void replacePredecessor(VPBlockBase *Old, VPBlockBase *New) {
    auto I = find(Predecessors, Old);
    assert(I != Predecessors.end());
    assert(Old->getParent() == New->getParent() &&
           "replaced predecessor must have the same parent");
    *I = New;
  }

  /// This function replaces one successor with another, useful when
  /// trying to replace an old block in the CFG with a new one.
  void replaceSuccessor(VPBlockBase *Old, VPBlockBase *New) {
    auto I = find(Successors, Old);
    assert(I != Successors.end());
    assert(Old->getParent() == New->getParent() &&
           "replaced successor must have the same parent");
    *I = New;
  }

protected:
  VPBlockBase(const unsigned char SC, const std::string &N)
      : SubclassID(SC), Name(N) {}

public:
  /// An enumeration for keeping track of the concrete subclass of VPBlockBase
  /// that are actually instantiated. Values of this enumeration are kept in the
  /// SubclassID field of the VPBlockBase objects. They are used for concrete
  /// type identification.
  using VPBlockTy = enum { VPRegionBlockSC, VPBasicBlockSC, VPIRBasicBlockSC };

  using VPBlocksTy = SmallVectorImpl<VPBlockBase *>;

  virtual ~VPBlockBase() = default;

  const std::string &getName() const { return Name; }

  void setName(const Twine &newName) { Name = newName.str(); }

  /// \return an ID for the concrete type of this object.
  /// This is used to implement the classof checks. This should not be used
  /// for any other purpose, as the values may change as LLVM evolves.
  unsigned getVPBlockID() const { return SubclassID; }

  VPRegionBlock *getParent() { return Parent; }
  const VPRegionBlock *getParent() const { return Parent; }

  /// \return A pointer to the plan containing the current block.
  VPlan *getPlan();
  const VPlan *getPlan() const;

  /// Sets the pointer of the plan containing the block. The block must be the
  /// entry block into the VPlan.
  void setPlan(VPlan *ParentPlan);

  void setParent(VPRegionBlock *P) { Parent = P; }

  /// \return the VPBasicBlock that is the entry of this VPBlockBase,
  /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
  /// VPBlockBase is a VPBasicBlock, it is returned.
  const VPBasicBlock *getEntryBasicBlock() const;
  VPBasicBlock *getEntryBasicBlock();

  /// \return the VPBasicBlock that is the exiting block of this VPBlockBase,
  /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
  /// VPBlockBase is a VPBasicBlock, it is returned.
  const VPBasicBlock *getExitingBasicBlock() const;
  VPBasicBlock *getExitingBasicBlock();

  const VPBlocksTy &getSuccessors() const { return Successors; }
  VPBlocksTy &getSuccessors() { return Successors; }

  iterator_range<VPBlockBase **> successors() { return Successors; }
  iterator_range<VPBlockBase **> predecessors() { return Predecessors; }

  const VPBlocksTy &getPredecessors() const { return Predecessors; }
  VPBlocksTy &getPredecessors() { return Predecessors; }

  /// \return the successor of this VPBlockBase if it has a single successor.
  /// Otherwise return a null pointer.
  VPBlockBase *getSingleSuccessor() const {
    return (Successors.size() == 1 ? *Successors.begin() : nullptr);
  }

  /// \return the predecessor of this VPBlockBase if it has a single
  /// predecessor. Otherwise return a null pointer.
  VPBlockBase *getSinglePredecessor() const {
    return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr);
  }

  size_t getNumSuccessors() const { return Successors.size(); }
  size_t getNumPredecessors() const { return Predecessors.size(); }

  /// An Enclosing Block of a block B is any block containing B, including B
  /// itself. \return the closest enclosing block starting from "this", which
  /// has successors. \return the root enclosing block if all enclosing blocks
  /// have no successors.
  VPBlockBase *getEnclosingBlockWithSuccessors();

  /// \return the closest enclosing block starting from "this", which has
  /// predecessors. \return the root enclosing block if all enclosing blocks
  /// have no predecessors.
  VPBlockBase *getEnclosingBlockWithPredecessors();

  /// \return the successors either attached directly to this VPBlockBase or, if
  /// this VPBlockBase is the exit block of a VPRegionBlock and has no
  /// successors of its own, search recursively for the first enclosing
  /// VPRegionBlock that has successors and return them. If no such
  /// VPRegionBlock exists, return the (empty) successors of the topmost
  /// VPBlockBase reached.
  const VPBlocksTy &getHierarchicalSuccessors() {
    return getEnclosingBlockWithSuccessors()->getSuccessors();
  }

  /// \return the hierarchical successor of this VPBlockBase if it has a single
  /// hierarchical successor. Otherwise return a null pointer.
  VPBlockBase *getSingleHierarchicalSuccessor() {
    return getEnclosingBlockWithSuccessors()->getSingleSuccessor();
  }

  /// \return the predecessors either attached directly to this VPBlockBase or,
  /// if this VPBlockBase is the entry block of a VPRegionBlock and has no
  /// predecessors of its own, search recursively for the first enclosing
  /// VPRegionBlock that has predecessors and return them. If no such
  /// VPRegionBlock exists, return the (empty) predecessors of the topmost
  /// VPBlockBase reached.
  const VPBlocksTy &getHierarchicalPredecessors() {
    return getEnclosingBlockWithPredecessors()->getPredecessors();
  }

  /// \return the hierarchical predecessor of this VPBlockBase if it has a
  /// single hierarchical predecessor. Otherwise return a null pointer.
  VPBlockBase *getSingleHierarchicalPredecessor() {
    return getEnclosingBlockWithPredecessors()->getSinglePredecessor();
  }

  /// Set a given VPBlockBase \p Successor as the single successor of this
  /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor.
  /// This VPBlockBase must have no successors.
  void setOneSuccessor(VPBlockBase *Successor) {
    assert(Successors.empty() && "Setting one successor when others exist.");
    assert(Successor->getParent() == getParent() &&
           "connected blocks must have the same parent");
    appendSuccessor(Successor);
  }

  /// Set two given VPBlockBases \p IfTrue and \p IfFalse to be the two
  /// successors of this VPBlockBase. This VPBlockBase is not added as
  /// predecessor of \p IfTrue or \p IfFalse. This VPBlockBase must have no
  /// successors.
  void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse) {
    assert(Successors.empty() && "Setting two successors when others exist.");
    appendSuccessor(IfTrue);
    appendSuccessor(IfFalse);
  }

  /// Set each VPBlockBase in \p NewPreds as predecessor of this VPBlockBase.
  /// This VPBlockBase must have no predecessors. This VPBlockBase is not added
  /// as successor of any VPBlockBase in \p NewPreds.
  void setPredecessors(ArrayRef<VPBlockBase *> NewPreds) {
    assert(Predecessors.empty() && "Block predecessors already set.");
    for (auto *Pred : NewPreds)
      appendPredecessor(Pred);
  }

  /// Set each VPBlockBase in \p NewSuccs as successor of this VPBlockBase.
  /// This VPBlockBase must have no successors. This VPBlockBase is not added
  /// as predecessor of any VPBlockBase in \p NewSuccs.
  void setSuccessors(ArrayRef<VPBlockBase *> NewSuccs) {
    assert(Successors.empty() && "Block successors already set.");
    for (auto *Succ : NewSuccs)
      appendSuccessor(Succ);
  }

  /// Remove all the predecessors of this block.
  void clearPredecessors() { Predecessors.clear(); }

  /// Remove all the successors of this block.
  void clearSuccessors() { Successors.clear(); }

  /// Swap successors of the block. The block must have exactly 2 successors.
  // TODO: This should be part of introducing conditional branch recipes rather
  // than being independent.
  void swapSuccessors() {
    assert(Successors.size() == 2 && "must have 2 successors to swap");
    std::swap(Successors[0], Successors[1]);
  }
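
  // Illustrative sketch: wiring a two-way split by hand. Successor and
  // predecessor lists must be kept consistent by the caller; the VPBlockUtils
  // helpers update both sides at once.
  //
  //   Cond->setTwoSuccessors(Then, Else);
  //   Then->setPredecessors({Cond});
  //   Else->setPredecessors({Cond});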

  /// The method which generates the output IR that corresponds to this
  /// VPBlockBase, thereby "executing" the VPlan.
  virtual void execute(VPTransformState *State) = 0;

  /// Return the cost of the block.
  virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0;

  /// Return true if it is legal to hoist instructions into this block.
  bool isLegalToHoistInto() {
    // There are currently no constraints that prevent an instruction from
    // being hoisted into a VPBlockBase.
    return true;
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void printAsOperand(raw_ostream &OS, bool PrintType = false) const {
    OS << getName();
  }

  /// Print plain-text dump of this VPBlockBase to \p O, prefixing all lines
  /// with \p Indent. \p SlotTracker is used to print unnamed VPValue's using
  /// consecutive numbers.
  ///
  /// Note that the numbering is applied to the whole VPlan, so printing
  /// individual blocks is consistent with the whole VPlan printing.
  virtual void print(raw_ostream &O, const Twine &Indent,
                     VPSlotTracker &SlotTracker) const = 0;

  /// Print plain-text dump of this VPlan to \p O.
  void print(raw_ostream &O) const {
    VPSlotTracker SlotTracker(getPlan());
    print(O, "", SlotTracker);
  }

  /// Print the successors of this block to \p O, prefixing all lines with \p
  /// Indent.
  void printSuccessors(raw_ostream &O, const Twine &Indent) const;

  /// Dump this VPBlockBase to dbgs().
  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
#endif

  /// Clone the current block and its recipes without updating the operands of
  /// the cloned recipes, including all blocks in the single-entry single-exit
  /// region for VPRegionBlocks.
  virtual VPBlockBase *clone() = 0;
};

/// Struct to hold various analyses needed for cost computations.
struct VPCostContext {
  const TargetTransformInfo &TTI;
  const TargetLibraryInfo &TLI;
  VPTypeAnalysis Types;
  LLVMContext &LLVMCtx;
  LoopVectorizationCostModel &CM;
  SmallPtrSet<Instruction *, 8> SkipCostComputation;
  TargetTransformInfo::TargetCostKind CostKind;

  VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
                Type *CanIVTy, LoopVectorizationCostModel &CM,
                TargetTransformInfo::TargetCostKind CostKind)
      : TTI(TTI), TLI(TLI), Types(CanIVTy), LLVMCtx(CanIVTy->getContext()),
        CM(CM), CostKind(CostKind) {}

  /// Return the cost for \p UI with \p VF using the legacy cost model as
  /// fallback until computing the cost of all recipes migrates to VPlan.
  InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const;

  /// Return true if the cost for \p UI shouldn't be computed, e.g. because it
  /// has already been pre-computed.
  bool skipCostComputation(Instruction *UI, bool IsVector) const;

  /// Returns the OperandInfo for \p V, if it is a live-in.
  TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const;
};

/// VPRecipeBase is a base class modeling a sequence of one or more output IR
/// instructions. VPRecipeBase owns the VPValues it defines through VPDef
/// and is responsible for deleting its defined values. Single-value
/// recipes must inherit from VPSingleDefRecipe instead of inheriting from both
/// VPRecipeBase and VPValue separately.
class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
                     public VPDef,
                     public VPUser {
  friend VPBasicBlock;
  friend class VPBlockUtils;

  /// Each VPRecipe belongs to a single VPBasicBlock.
  VPBasicBlock *Parent = nullptr;

  /// The debug location for the recipe.
  DebugLoc DL;

public:
  VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands,
               DebugLoc DL = {})
      : VPDef(SC), VPUser(Operands), DL(DL) {}

  template <typename IterT>
  VPRecipeBase(const unsigned char SC, iterator_range<IterT> Operands,
               DebugLoc DL = {})
      : VPDef(SC), VPUser(Operands), DL(DL) {}
  virtual ~VPRecipeBase() = default;

  /// Clone the current recipe.
  virtual VPRecipeBase *clone() = 0;

  /// \return the VPBasicBlock which this VPRecipe belongs to.
  VPBasicBlock *getParent() { return Parent; }
  const VPBasicBlock *getParent() const { return Parent; }

  /// The method which generates the output IR instructions that correspond to
  /// this VPRecipe, thereby "executing" the VPlan.
  virtual void execute(VPTransformState &State) = 0;

  /// Return the cost of this recipe, taking into account if the cost
  /// computation should be skipped and the ForceTargetInstructionCost flag.
  /// Also takes care of printing the cost for debugging.
  InstructionCost cost(ElementCount VF, VPCostContext &Ctx);

  /// Insert an unlinked recipe into a basic block immediately before
  /// the specified recipe.
  void insertBefore(VPRecipeBase *InsertPos);
  /// Insert an unlinked recipe into \p BB immediately before the insertion
  /// point \p IP.
  void insertBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator IP);

  /// Insert an unlinked Recipe into a basic block immediately after
  /// the specified Recipe.
  void insertAfter(VPRecipeBase *InsertPos);

  /// Unlink this recipe from its current VPBasicBlock and insert it into
  /// the VPBasicBlock that MovePos lives in, right after MovePos.
  void moveAfter(VPRecipeBase *MovePos);

  /// Unlink this recipe and insert into BB before I.
  ///
  /// \pre I is a valid iterator into BB.
  void moveBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator I);

  /// This method unlinks 'this' from the containing basic block, but does not
  /// delete it.
  void removeFromParent();

  /// This method unlinks 'this' from the containing basic block and deletes it.
  ///
  /// \returns an iterator pointing to the element after the erased one
  iplist<VPRecipeBase>::iterator eraseFromParent();
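
  // Illustrative sketch of the list API above, for recipes R and Other that
  // are already linked into VPBasicBlocks:
  //
  //   R->moveAfter(Other);    // unlink R and re-insert right after Other
  //   R->removeFromParent();  // unlink only; R remains allocated
  //   // Alternatively, R->eraseFromParent() unlinks and deletes R.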

  /// Method to support type inquiry through isa, cast, and dyn_cast.
  static inline bool classof(const VPDef *D) {
    // All VPDefs are also VPRecipeBases.
    return true;
  }

  static inline bool classof(const VPUser *U) { return true; }

  /// Returns true if the recipe may have side-effects.
  bool mayHaveSideEffects() const;

  /// Returns true for PHI-like recipes.
  bool isPhi() const {
    return getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC;
  }

  /// Returns true if the recipe may read from memory.
  bool mayReadFromMemory() const;

  /// Returns true if the recipe may write to memory.
  bool mayWriteToMemory() const;

  /// Returns true if the recipe may read from or write to memory.
  bool mayReadOrWriteMemory() const {
    return mayReadFromMemory() || mayWriteToMemory();
  }

  /// Returns the debug location of the recipe.
  DebugLoc getDebugLoc() const { return DL; }

protected:
  /// Compute the cost of this recipe either using a recipe's specialized
  /// implementation or using the legacy cost model and the underlying
  /// instructions.
  virtual InstructionCost computeCost(ElementCount VF,
                                      VPCostContext &Ctx) const;
};

// Helper macro to define common classof implementations for recipes.
#define VP_CLASSOF_IMPL(VPDefID)                                               \
  static inline bool classof(const VPDef *D) {                                 \
    return D->getVPDefID() == VPDefID;                                         \
  }                                                                            \
  static inline bool classof(const VPValue *V) {                               \
    auto *R = V->getDefiningRecipe();                                          \
    return R && R->getVPDefID() == VPDefID;                                    \
  }                                                                            \
  static inline bool classof(const VPUser *U) {                                \
    auto *R = dyn_cast<VPRecipeBase>(U);                                       \
    return R && R->getVPDefID() == VPDefID;                                    \
  }                                                                            \
  static inline bool classof(const VPRecipeBase *R) {                          \
    return R->getVPDefID() == VPDefID;                                         \
  }                                                                            \
  static inline bool classof(const VPSingleDefRecipe *R) {                     \
    return R->getVPDefID() == VPDefID;                                         \
  }
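
// Illustrative sketch: a recipe class places VP_CLASSOF_IMPL(VPDef::...SC) in
// its public section, after which the usual casting utilities work across the
// VPDef/VPValue/VPUser hierarchies, e.g.:
//
//   if (auto *VPI = dyn_cast<VPInstruction>(R)) // R is a VPRecipeBase *.
//     ...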

/// VPSingleDefRecipe is a base class for recipes modeling a sequence of one or
/// more output IR instructions that define a single result VPValue.
/// Note that VPRecipeBase must be inherited from before VPValue.
class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
public:
  template <typename IterT>
  VPSingleDefRecipe(const unsigned char SC, IterT Operands, DebugLoc DL = {})
      : VPRecipeBase(SC, Operands, DL), VPValue(this) {}

  VPSingleDefRecipe(const unsigned char SC, ArrayRef<VPValue *> Operands,
                    DebugLoc DL = {})
      : VPRecipeBase(SC, Operands, DL), VPValue(this) {}

  template <typename IterT>
  VPSingleDefRecipe(const unsigned char SC, IterT Operands, Value *UV,
                    DebugLoc DL = {})
      : VPRecipeBase(SC, Operands, DL), VPValue(this, UV) {}

  static inline bool classof(const VPRecipeBase *R) {
    switch (R->getVPDefID()) {
    case VPRecipeBase::VPDerivedIVSC:
    case VPRecipeBase::VPEVLBasedIVPHISC:
    case VPRecipeBase::VPExpandSCEVSC:
    case VPRecipeBase::VPInstructionSC:
    case VPRecipeBase::VPReductionEVLSC:
    case VPRecipeBase::VPReductionSC:
    case VPRecipeBase::VPReplicateSC:
    case VPRecipeBase::VPScalarIVStepsSC:
    case VPRecipeBase::VPVectorPointerSC:
    case VPRecipeBase::VPReverseVectorPointerSC:
    case VPRecipeBase::VPWidenCallSC:
    case VPRecipeBase::VPWidenCanonicalIVSC:
    case VPRecipeBase::VPWidenCastSC:
    case VPRecipeBase::VPWidenGEPSC:
    case VPRecipeBase::VPWidenIntrinsicSC:
    case VPRecipeBase::VPWidenSC:
    case VPRecipeBase::VPWidenEVLSC:
    case VPRecipeBase::VPWidenSelectSC:
    case VPRecipeBase::VPBlendSC:
    case VPRecipeBase::VPPredInstPHISC:
    case VPRecipeBase::VPCanonicalIVPHISC:
    case VPRecipeBase::VPActiveLaneMaskPHISC:
    case VPRecipeBase::VPFirstOrderRecurrencePHISC:
    case VPRecipeBase::VPWidenPHISC:
    case VPRecipeBase::VPWidenIntOrFpInductionSC:
    case VPRecipeBase::VPWidenPointerInductionSC:
    case VPRecipeBase::VPReductionPHISC:
    case VPRecipeBase::VPScalarCastSC:
    case VPRecipeBase::VPPartialReductionSC:
      return true;
    case VPRecipeBase::VPBranchOnMaskSC:
    case VPRecipeBase::VPInterleaveSC:
    case VPRecipeBase::VPIRInstructionSC:
    case VPRecipeBase::VPWidenLoadEVLSC:
    case VPRecipeBase::VPWidenLoadSC:
    case VPRecipeBase::VPWidenStoreEVLSC:
    case VPRecipeBase::VPWidenStoreSC:
    case VPRecipeBase::VPHistogramSC:
      // TODO: Widened stores don't define a value, but widened loads do. Split
      // the recipes to be able to make widened loads VPSingleDefRecipes.
      return false;
    }
    llvm_unreachable("Unhandled VPDefID");
  }

  static inline bool classof(const VPUser *U) {
    auto *R = dyn_cast<VPRecipeBase>(U);
    return R && classof(R);
  }

  virtual VPSingleDefRecipe *clone() override = 0;

  /// Returns the underlying instruction.
  Instruction *getUnderlyingInstr() {
    return cast<Instruction>(getUnderlyingValue());
  }
  const Instruction *getUnderlyingInstr() const {
    return cast<Instruction>(getUnderlyingValue());
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print this VPSingleDefRecipe to dbgs() (for debugging).
  LLVM_DUMP_METHOD void dump() const;
#endif
};

/// Class to record LLVM IR flags for a recipe along with it.
class VPRecipeWithIRFlags : public VPSingleDefRecipe {
  enum class OperationType : unsigned char {
    Cmp,
    OverflowingBinOp,
    DisjointOp,
    PossiblyExactOp,
    GEPOp,
    FPMathOp,
    NonNegOp,
    Other
  };

public:
  struct WrapFlagsTy {
    char HasNUW : 1;
    char HasNSW : 1;

    WrapFlagsTy(bool HasNUW, bool HasNSW) : HasNUW(HasNUW), HasNSW(HasNSW) {}
  };

  struct DisjointFlagsTy {
    char IsDisjoint : 1;
    DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {}
  };

private:
  struct ExactFlagsTy {
    char IsExact : 1;
  };
  struct NonNegFlagsTy {
    char NonNeg : 1;
  };
  struct FastMathFlagsTy {
    char AllowReassoc : 1;
    char NoNaNs : 1;
    char NoInfs : 1;
    char NoSignedZeros : 1;
    char AllowReciprocal : 1;
    char AllowContract : 1;
    char ApproxFunc : 1;

    FastMathFlagsTy(const FastMathFlags &FMF);
  };

  OperationType OpType;

  union {
    CmpInst::Predicate CmpPredicate;
    WrapFlagsTy WrapFlags;
    DisjointFlagsTy DisjointFlags;
    ExactFlagsTy ExactFlags;
    GEPNoWrapFlags GEPFlags;
    NonNegFlagsTy NonNegFlags;
    FastMathFlagsTy FMFs;
    unsigned AllFlags;
  };

protected:
  void transferFlags(VPRecipeWithIRFlags &Other) {
    OpType = Other.OpType;
    AllFlags = Other.AllFlags;
  }

public:
  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL) {
    OpType = OperationType::Other;
    AllFlags = 0;
  }

  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, Instruction &I)
      : VPSingleDefRecipe(SC, Operands, &I, I.getDebugLoc()) {
    if (auto *Op = dyn_cast<CmpInst>(&I)) {
      OpType = OperationType::Cmp;
      CmpPredicate = Op->getPredicate();
    } else if (auto *Op = dyn_cast<PossiblyDisjointInst>(&I)) {
      OpType = OperationType::DisjointOp;
      DisjointFlags.IsDisjoint = Op->isDisjoint();
    } else if (auto *Op = dyn_cast<OverflowingBinaryOperator>(&I)) {
      OpType = OperationType::OverflowingBinOp;
      WrapFlags = {Op->hasNoUnsignedWrap(), Op->hasNoSignedWrap()};
    } else if (auto *Op = dyn_cast<PossiblyExactOperator>(&I)) {
      OpType = OperationType::PossiblyExactOp;
      ExactFlags.IsExact = Op->isExact();
    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      OpType = OperationType::GEPOp;
      GEPFlags = GEP->getNoWrapFlags();
    } else if (auto *PNNI = dyn_cast<PossiblyNonNegInst>(&I)) {
      OpType = OperationType::NonNegOp;
      NonNegFlags.NonNeg = PNNI->hasNonNeg();
    } else if (auto *Op = dyn_cast<FPMathOperator>(&I)) {
      OpType = OperationType::FPMathOp;
      FMFs = Op->getFastMathFlags();
    } else {
      OpType = OperationType::Other;
      AllFlags = 0;
    }
  }

  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
                      CmpInst::Predicate Pred, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::Cmp),
        CmpPredicate(Pred) {}

  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
                      WrapFlagsTy WrapFlags, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL),
        OpType(OperationType::OverflowingBinOp), WrapFlags(WrapFlags) {}

  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
                      FastMathFlags FMFs, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::FPMathOp),
        FMFs(FMFs) {}

  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
                      DisjointFlagsTy DisjointFlags, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::DisjointOp),
        DisjointFlags(DisjointFlags) {}

protected:
  template <typename IterT>
  VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
                      GEPNoWrapFlags GEPFlags, DebugLoc DL = {})
      : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::GEPOp),
        GEPFlags(GEPFlags) {}

public:
  static inline bool classof(const VPRecipeBase *R) {
    return R->getVPDefID() == VPRecipeBase::VPInstructionSC ||
           R->getVPDefID() == VPRecipeBase::VPWidenSC ||
           R->getVPDefID() == VPRecipeBase::VPWidenEVLSC ||
           R->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
           R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
           R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
           R->getVPDefID() == VPRecipeBase::VPReverseVectorPointerSC ||
           R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
  }

  static inline bool classof(const VPUser *U) {
    auto *R = dyn_cast<VPRecipeBase>(U);
    return R && classof(R);
  }

  /// Drop all poison-generating flags.
  void dropPoisonGeneratingFlags() {
    // NOTE: This needs to be kept in-sync with
    // Instruction::dropPoisonGeneratingFlags.
    switch (OpType) {
    case OperationType::OverflowingBinOp:
      WrapFlags.HasNUW = false;
      WrapFlags.HasNSW = false;
      break;
    case OperationType::DisjointOp:
      DisjointFlags.IsDisjoint = false;
      break;
    case OperationType::PossiblyExactOp:
      ExactFlags.IsExact = false;
      break;
    case OperationType::GEPOp:
      GEPFlags = GEPNoWrapFlags::none();
      break;
    case OperationType::FPMathOp:
      FMFs.NoNaNs = false;
      FMFs.NoInfs = false;
      break;
    case OperationType::NonNegOp:
      NonNegFlags.NonNeg = false;
      break;
    case OperationType::Cmp:
    case OperationType::Other:
      break;
    }
  }

  /// Set the IR flags for \p I.
  void setFlags(Instruction *I) const {
    switch (OpType) {
    case OperationType::OverflowingBinOp:
      I->setHasNoUnsignedWrap(WrapFlags.HasNUW);
      I->setHasNoSignedWrap(WrapFlags.HasNSW);
      break;
    case OperationType::DisjointOp:
      cast<PossiblyDisjointInst>(I)->setIsDisjoint(DisjointFlags.IsDisjoint);
      break;
    case OperationType::PossiblyExactOp:
      I->setIsExact(ExactFlags.IsExact);
      break;
    case OperationType::GEPOp:
      cast<GetElementPtrInst>(I)->setNoWrapFlags(GEPFlags);
      break;
    case OperationType::FPMathOp:
      I->setHasAllowReassoc(FMFs.AllowReassoc);
      I->setHasNoNaNs(FMFs.NoNaNs);
      I->setHasNoInfs(FMFs.NoInfs);
      I->setHasNoSignedZeros(FMFs.NoSignedZeros);
      I->setHasAllowReciprocal(FMFs.AllowReciprocal);
      I->setHasAllowContract(FMFs.AllowContract);
      I->setHasApproxFunc(FMFs.ApproxFunc);
      break;
    case OperationType::NonNegOp:
      I->setNonNeg(NonNegFlags.NonNeg);
      break;
    case OperationType::Cmp:
    case OperationType::Other:
      break;
    }
  }

  CmpInst::Predicate getPredicate() const {
    assert(OpType == OperationType::Cmp &&
           "recipe doesn't have a compare predicate");
    return CmpPredicate;
  }

  GEPNoWrapFlags getGEPNoWrapFlags() const { return GEPFlags; }

  /// Returns true if the recipe has fast-math flags.
  bool hasFastMathFlags() const { return OpType == OperationType::FPMathOp; }

  FastMathFlags getFastMathFlags() const;

  bool hasNoUnsignedWrap() const {
    assert(OpType == OperationType::OverflowingBinOp &&
           "recipe doesn't have a NUW flag");
    return WrapFlags.HasNUW;
  }

  bool hasNoSignedWrap() const {
    assert(OpType == OperationType::OverflowingBinOp &&
           "recipe doesn't have a NSW flag");
    return WrapFlags.HasNSW;
  }

  bool isDisjoint() const {
    assert(OpType == OperationType::DisjointOp &&
           "recipe cannot have a disjoint flag");
    return DisjointFlags.IsDisjoint;
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void printFlags(raw_ostream &O) const;
#endif
};
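
// Illustrative sketch of the flags round-trip above: a recipe constructed from
// an `add nuw nsw` instruction records the wrap flags; a transform may drop
// them before they are applied to generated IR:
//
//   R->dropPoisonGeneratingFlags(); // clears HasNUW/HasNSW
//   R->setFlags(NewAdd);            // NewAdd is created without nuw/nsw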

/// Helper to access the operand that contains the unroll part for this recipe
/// after unrolling.
template <unsigned PartOpIdx> class VPUnrollPartAccessor {
protected:
  /// Return the VPValue operand containing the unroll part or null if there is
  /// no such operand.
  VPValue *getUnrollPartOperand(VPUser &U) const;

  /// Return the unroll part.
  unsigned getUnrollPart(VPUser &U) const;
};

/// This is a concrete Recipe that models a single VPlan-level instruction.
/// While as any Recipe it may generate a sequence of IR instructions when
/// executed, these instructions would always form a single-def expression as
/// the VPInstruction is also a single def-use vertex.
class VPInstruction : public VPRecipeWithIRFlags,
                      public VPUnrollPartAccessor<1> {
  friend class VPlanSlp;

public:
  /// VPlan opcodes, extending LLVM IR with idiomatic instructions.
  enum {
    FirstOrderRecurrenceSplice =
        Instruction::OtherOpsEnd + 1, // Combines the incoming and previous
                                      // values of a first-order recurrence.
    Not,
    SLPLoad,
    SLPStore,
    ActiveLaneMask,
    ExplicitVectorLength,
    /// Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
    /// The first operand is the incoming value from the predecessor in VPlan,
    /// the second operand is the incoming value for all other predecessors
    /// (which are currently not modeled in VPlan).
    ResumePhi,
    CalculateTripCountMinusVF,
    // Increment the canonical IV separately for each unrolled part.
    CanonicalIVIncrementForPart,
    BranchOnCount,
    BranchOnCond,
    ComputeReductionResult,
    // Takes the VPValue to extract from as first operand and the lane or part
    // to extract as second operand, counting from the end starting with 1 for
    // last. The second operand must be a positive constant and <= VF.
    ExtractFromEnd,
    LogicalAnd, // Non-poison propagating logical And.
    // Add an offset in bytes (second operand) to a base pointer (first
    // operand). Only generates scalar values (either for the first lane only or
    // for all lanes, depending on its uses).
    PtrAdd,
    // Returns a scalar boolean value, which is true if any lane of its (only
    // boolean) vector operand is true.
    AnyOf,
  };

private:
  typedef unsigned char OpcodeTy;
  OpcodeTy Opcode;

  /// An optional name that can be used for the generated IR instruction.
  const std::string Name;

  /// Returns true if this VPInstruction generates scalar values for all lanes.
  /// Most VPInstructions generate a single value per part, either vector or
  /// scalar. VPReplicateRecipe takes care of generating multiple (scalar)
  /// values per all lanes, stemming from an original ingredient. This method
  /// identifies the (rare) cases of VPInstructions that do so as well, w/o an
  /// underlying ingredient.
  bool doesGeneratePerAllLanes() const;

  /// Returns true if we can generate a scalar for the first lane only if
  /// needed.
  bool canGenerateScalarForFirstLane() const;

  /// Utility methods serving execute(): generates a single vector instance of
  /// the modeled instruction. \returns the generated value. In some cases an
  /// existing value is returned rather than a generated one.
  Value *generate(VPTransformState &State);

  /// Utility methods serving execute(): generates a single scalar instance of
  /// the modeled instruction for a given lane. \returns the scalar generated
  /// value for lane \p Lane.
  Value *generatePerLane(VPTransformState &State, const VPLane &Lane);

#if !defined(NDEBUG)
  /// Return true if the VPInstruction is a floating point math operation, i.e.
  /// has fast-math flags.
  bool isFPMathOp() const;
#endif

public:
  VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL,
                const Twine &Name = "")
      : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL),
        Opcode(Opcode), Name(Name.str()) {}

  VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
                DebugLoc DL = {}, const Twine &Name = "")
      : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name) {}

  VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, VPValue *A,
                VPValue *B, DebugLoc DL = {}, const Twine &Name = "");

  VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
                WrapFlagsTy WrapFlags, DebugLoc DL = {}, const Twine &Name = "")
      : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, WrapFlags, DL),
        Opcode(Opcode), Name(Name.str()) {}

  VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
                DisjointFlagsTy DisjointFlag, DebugLoc DL = {},
                const Twine &Name = "")
      : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DisjointFlag, DL),
        Opcode(Opcode), Name(Name.str()) {
    assert(Opcode == Instruction::Or && "only OR opcodes can be disjoint");
  }

  VPInstruction(VPValue *Ptr, VPValue *Offset, GEPNoWrapFlags Flags,
                DebugLoc DL = {}, const Twine &Name = "")
      : VPRecipeWithIRFlags(VPDef::VPInstructionSC,
                            ArrayRef<VPValue *>({Ptr, Offset}), Flags, DL),
        Opcode(VPInstruction::PtrAdd), Name(Name.str()) {}

  VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
                FastMathFlags FMFs, DebugLoc DL = {}, const Twine &Name = "");

  VP_CLASSOF_IMPL(VPDef::VPInstructionSC)

  VPInstruction *clone() override {
    SmallVector<VPValue *, 2> Operands(operands());
    auto *New = new VPInstruction(Opcode, Operands, getDebugLoc(), Name);
    New->transferFlags(*this);
    return New;
  }

  unsigned getOpcode() const { return Opcode; }

  /// Generate the instruction.
  /// TODO: We currently execute only per-part unless a specific instance is
  /// provided.
  void execute(VPTransformState &State) override;

  /// Return the cost of this VPInstruction.
  InstructionCost computeCost(ElementCount VF,
                              VPCostContext &Ctx) const override {
    // TODO: Compute accurate cost after retiring the legacy cost model.
    return 0;
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the VPInstruction to \p O.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;

  /// Print the VPInstruction to dbgs() (for debugging).
  LLVM_DUMP_METHOD void dump() const;
#endif

  bool hasResult() const {
    // CallInst may or may not have a result, depending on the called function.
    // Conservatively assume calls have results for now.
    switch (getOpcode()) {
    case Instruction::Ret:
    case Instruction::Br:
    case Instruction::Store:
    case Instruction::Switch:
    case Instruction::IndirectBr:
    case Instruction::Resume:
    case Instruction::CatchRet:
    case Instruction::Unreachable:
    case Instruction::Fence:
    case Instruction::AtomicRMW:
    case VPInstruction::BranchOnCond:
    case VPInstruction::BranchOnCount:
      return false;
    default:
      return true;
    }
  }

  /// Returns true if the underlying opcode may read from or write to memory.
  bool opcodeMayReadOrWriteFromMemory() const;

  /// Returns true if the recipe only uses the first lane of operand \p Op.
  bool onlyFirstLaneUsed(const VPValue *Op) const override;

  /// Returns true if the recipe only uses the first part of operand \p Op.
  bool onlyFirstPartUsed(const VPValue *Op) const override;

  /// Returns true if this VPInstruction produces a scalar value from a vector,
  /// e.g. by performing a reduction or extracting a lane.
  bool isVectorToScalar() const;

  /// Returns true if this VPInstruction's operands are single scalars and the
  /// result is also a single scalar.
  bool isSingleScalar() const;

  /// Returns the symbolic name assigned to the VPInstruction.
  StringRef getName() const { return Name; }
};
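
// Illustrative sketch: creating a VPInstruction that adds two existing
// VPValues A and B and carries no-wrap flags (names are hypothetical):
//
//   auto *Sum = new VPInstruction(Instruction::Add, {A, B},
//                                 VPRecipeWithIRFlags::WrapFlagsTy(true, true),
//                                 DebugLoc(), "sum");
//   // Sum->hasNoUnsignedWrap() and Sum->hasNoSignedWrap() both return true.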

/// A recipe to wrap an original IR instruction not to be modified during
/// execution, except for PHIs. For PHIs, a single VPValue operand is allowed,
/// and it is used to add a new incoming value for the single predecessor VPBB.
/// Except for PHIs, VPIRInstructions cannot have any operands.
class VPIRInstruction : public VPRecipeBase {
  Instruction &I;

public:
  VPIRInstruction(Instruction &I)
      : VPRecipeBase(VPDef::VPIRInstructionSC, ArrayRef<VPValue *>()), I(I) {}

  ~VPIRInstruction() override = default;

  VP_CLASSOF_IMPL(VPDef::VPIRInstructionSC)

  VPIRInstruction *clone() override {
    auto *R = new VPIRInstruction(I);
    for (auto *Op : operands())
      R->addOperand(Op);
    return R;
  }

  void execute(VPTransformState &State) override;

  /// Return the cost of this VPIRInstruction.
  InstructionCost computeCost(ElementCount VF,
                              VPCostContext &Ctx) const override;

  Instruction &getInstruction() const { return I; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif

  bool usesScalars(const VPValue *Op) const override {
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    return true;
  }

  bool onlyFirstPartUsed(const VPValue *Op) const override {
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    return true;
  }

  bool onlyFirstLaneUsed(const VPValue *Op) const override {
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    return true;
  }

  /// Update the recipe's single operand to the last lane of the operand using
  /// \p Builder. Must only be used for single operand VPIRInstructions wrapping
  /// a PHINode.
  void extractLastLaneOfOperand(VPBuilder &Builder);
};

/// VPWidenRecipe is a recipe for producing a widened instruction using the
/// opcode and operands of the recipe. This recipe covers most of the
/// traditional vectorization cases where each recipe transforms into a
/// vectorized version of itself.
class VPWidenRecipe : public VPRecipeWithIRFlags {
  unsigned Opcode;

protected:
  template <typename IterT>
  VPWidenRecipe(unsigned VPDefOpcode, Instruction &I,
                iterator_range<IterT> Operands)
      : VPRecipeWithIRFlags(VPDefOpcode, Operands, I), Opcode(I.getOpcode()) {}

public:
  template <typename IterT>
  VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
      : VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {}

  ~VPWidenRecipe() override = default;

  VPWidenRecipe *clone() override {
    auto *R = new VPWidenRecipe(*getUnderlyingInstr(), operands());
    R->transferFlags(*this);
    return R;
  }

  static inline bool classof(const VPRecipeBase *R) {
    return R->getVPDefID() == VPRecipeBase::VPWidenSC ||
           R->getVPDefID() == VPRecipeBase::VPWidenEVLSC;
  }

  static inline bool classof(const VPUser *U) {
    auto *R = dyn_cast<VPRecipeBase>(U);
    return R && classof(R);
  }

  /// Produce a widened instruction using the opcode and operands of the recipe,
  /// processing State.VF elements.
  void execute(VPTransformState &State) override;

  /// Return the cost of this VPWidenRecipe.
  InstructionCost computeCost(ElementCount VF,
                              VPCostContext &Ctx) const override;

  unsigned getOpcode() const { return Opcode; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override;
#endif
};

/// A recipe for widening operations with vector-predication intrinsics with
/// explicit vector length (EVL).
class VPWidenEVLRecipe : public VPWidenRecipe {
  using VPRecipeWithIRFlags::transferFlags;

public:
  template <typename IterT>
  VPWidenEVLRecipe(Instruction &I, iterator_range<IterT> Operands, VPValue &EVL)
      : VPWidenRecipe(VPDef::VPWidenEVLSC, I, Operands) {
    addOperand(&EVL);
  }
  VPWidenEVLRecipe(VPWidenRecipe &W, VPValue &EVL)
      : VPWidenEVLRecipe(*W.getUnderlyingInstr(), W.operands(), EVL) {
    transferFlags(W);
  }

  ~VPWidenEVLRecipe() override = default;

  VPWidenRecipe *clone() override final {
    llvm_unreachable("VPWidenEVLRecipe cannot be cloned");
    return nullptr;
  }

  VP_CLASSOF_IMPL(VPDef::VPWidenEVLSC);

  VPValue *getEVL() { return getOperand(getNumOperands() - 1); }
  const VPValue *getEVL() const { return getOperand(getNumOperands() - 1); }

  /// Produce a vp-intrinsic using the opcode and operands of the recipe,
  /// processing EVL elements.
  void execute(VPTransformState &State) override final;

  /// Returns true if the recipe only uses the first lane of operand \p Op.
  bool onlyFirstLaneUsed(const VPValue *Op) const override {
    assert(is_contained(operands(), Op) &&
           "Op must be an operand of the recipe");
    // The EVL is always the last operand of the recipe; any use of an earlier
    // operand means the VPValue is vectorized.
    return getEVL() == Op;
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Print the recipe.
  void print(raw_ostream &O, const Twine &Indent,
             VPSlotTracker &SlotTracker) const override final;
#endif
};
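
// Illustrative sketch: an EVL-based transform can rewrite an existing widened
// recipe W into its vector-predicated form by appending the explicit vector
// length as the trailing operand (W and EVL are hypothetical names):
//
//   auto *VPRecipe = new VPWidenEVLRecipe(*W, *EVL);
//   // VPRecipe->getEVL() returns the EVL operand, which is always last.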
1537 Instruction::CastOps Opcode; 1538 1539 /// Result type for the cast. 1540 Type *ResultTy; 1541 1542 public: 1543 VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, 1544 CastInst &UI) 1545 : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), Opcode(Opcode), 1546 ResultTy(ResultTy) { 1547 assert(UI.getOpcode() == Opcode && 1548 "opcode of underlying cast doesn't match"); 1549 } 1550 1551 VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy) 1552 : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), Opcode(Opcode), 1553 ResultTy(ResultTy) {} 1554 1555 ~VPWidenCastRecipe() override = default; 1556 1557 VPWidenCastRecipe *clone() override { 1558 if (auto *UV = getUnderlyingValue()) 1559 return new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy, 1560 *cast<CastInst>(UV)); 1561 1562 return new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy); 1563 } 1564 1565 VP_CLASSOF_IMPL(VPDef::VPWidenCastSC) 1566 1567 /// Produce widened copies of the cast. 1568 void execute(VPTransformState &State) override; 1569 1570 /// Return the cost of this VPWidenCastRecipe. 1571 InstructionCost computeCost(ElementCount VF, 1572 VPCostContext &Ctx) const override; 1573 1574 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1575 /// Print the recipe. 1576 void print(raw_ostream &O, const Twine &Indent, 1577 VPSlotTracker &SlotTracker) const override; 1578 #endif 1579 1580 Instruction::CastOps getOpcode() const { return Opcode; } 1581 1582 /// Returns the result type of the cast. 1583 Type *getResultType() const { return ResultTy; } 1584 }; 1585 1586 /// VPScalarCastRecipe is a recipe to create scalar cast instructions. 1587 class VPScalarCastRecipe : public VPSingleDefRecipe { 1588 Instruction::CastOps Opcode; 1589 1590 Type *ResultTy; 1591 1592 Value *generate(VPTransformState &State); 1593 1594 public: 1595 VPScalarCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, 1596 DebugLoc DL) 1597 : VPSingleDefRecipe(VPDef::VPScalarCastSC, {Op}, DL), Opcode(Opcode), 1598 ResultTy(ResultTy) {} 1599 1600 ~VPScalarCastRecipe() override = default; 1601 1602 VPScalarCastRecipe *clone() override { 1603 return new VPScalarCastRecipe(Opcode, getOperand(0), ResultTy, 1604 getDebugLoc()); 1605 } 1606 1607 VP_CLASSOF_IMPL(VPDef::VPScalarCastSC) 1608 1609 void execute(VPTransformState &State) override; 1610 1611 /// Return the cost of this VPScalarCastRecipe. 1612 InstructionCost computeCost(ElementCount VF, 1613 VPCostContext &Ctx) const override { 1614 // TODO: Compute accurate cost after retiring the legacy cost model. 1615 return 0; 1616 } 1617 1618 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1619 void print(raw_ostream &O, const Twine &Indent, 1620 VPSlotTracker &SlotTracker) const override; 1621 #endif 1622 1623 /// Returns the result type of the cast. 1624 Type *getResultType() const { return ResultTy; } 1625 1626 bool onlyFirstLaneUsed(const VPValue *Op) const override { 1627 // At the moment, only uniform codegen is implemented. 1628 assert(is_contained(operands(), Op) && 1629 "Op must be an operand of the recipe"); 1630 return true; 1631 } 1632 }; 1633 1634 /// A recipe for widening vector intrinsics. 1635 class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags { 1636 /// ID of the vector intrinsic to widen. 1637 Intrinsic::ID VectorIntrinsicID; 1638 1639 /// Scalar return type of the intrinsic. 1640 Type *ResultTy; 1641 1642 /// True if the intrinsic may read from memory. 
1643 bool MayReadFromMemory;
1644
1645 /// True if the intrinsic may write to memory.
1646 bool MayWriteToMemory;
1647
1648 /// True if the intrinsic may have side-effects.
1649 bool MayHaveSideEffects;
1650
1651 public:
1652 VPWidenIntrinsicRecipe(CallInst &CI, Intrinsic::ID VectorIntrinsicID,
1653 ArrayRef<VPValue *> CallArguments, Type *Ty,
1654 DebugLoc DL = {})
1655 : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, CI),
1656 VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty),
1657 MayReadFromMemory(CI.mayReadFromMemory()),
1658 MayWriteToMemory(CI.mayWriteToMemory()),
1659 MayHaveSideEffects(CI.mayHaveSideEffects()) {}
1660
1661 VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
1662 ArrayRef<VPValue *> CallArguments, Type *Ty,
1663 DebugLoc DL = {})
1664 : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, DL),
1665 VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) {
1666 LLVMContext &Ctx = Ty->getContext();
1667 AttributeList Attrs = Intrinsic::getAttributes(Ctx, VectorIntrinsicID);
1668 MemoryEffects ME = Attrs.getMemoryEffects();
1669 MayReadFromMemory = !ME.onlyWritesMemory();
1670 MayWriteToMemory = !ME.onlyReadsMemory();
1671 MayHaveSideEffects = MayWriteToMemory ||
1672 !Attrs.hasFnAttr(Attribute::NoUnwind) ||
1673 !Attrs.hasFnAttr(Attribute::WillReturn);
1674 }
1675
1676 VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
1677 std::initializer_list<VPValue *> CallArguments,
1678 Type *Ty, DebugLoc DL = {})
1679 : VPWidenIntrinsicRecipe(VectorIntrinsicID,
1680 ArrayRef<VPValue *>(CallArguments), Ty, DL) {}
1681
1682 ~VPWidenIntrinsicRecipe() override = default;
1683
1684 VPWidenIntrinsicRecipe *clone() override {
1685 return new VPWidenIntrinsicRecipe(*cast<CallInst>(getUnderlyingValue()),
1686 VectorIntrinsicID, {op_begin(), op_end()},
1687 ResultTy, getDebugLoc());
1688 }
1689
1690 VP_CLASSOF_IMPL(VPDef::VPWidenIntrinsicSC)
1691
1692 /// Produce a widened version of the vector intrinsic.
1693 void execute(VPTransformState &State) override;
1694
1695 /// Return the cost of this vector intrinsic.
1696 InstructionCost computeCost(ElementCount VF,
1697 VPCostContext &Ctx) const override;
1698
1699 /// Return the ID of the intrinsic.
1700 Intrinsic::ID getVectorIntrinsicID() const { return VectorIntrinsicID; }
1701
1702 /// Return the scalar return type of the intrinsic.
1703 Type *getResultType() const { return ResultTy; }
1704
1705 /// Return the name of the intrinsic as a string.
1706 StringRef getIntrinsicName() const;
1707
1708 /// Returns true if the intrinsic may read from memory.
1709 bool mayReadFromMemory() const { return MayReadFromMemory; }
1710
1711 /// Returns true if the intrinsic may write to memory.
1712 bool mayWriteToMemory() const { return MayWriteToMemory; }
1713
1714 /// Returns true if the intrinsic may have side-effects.
1715 bool mayHaveSideEffects() const { return MayHaveSideEffects; }
1716
1717 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1718 /// Print the recipe.
1719 void print(raw_ostream &O, const Twine &Indent,
1720 VPSlotTracker &SlotTracker) const override;
1721 #endif
1722
1723 bool onlyFirstLaneUsed(const VPValue *Op) const override;
1724 };
1725
1726 /// A recipe for widening Call instructions using library calls.
1727 class VPWidenCallRecipe : public VPRecipeWithIRFlags {
1728 /// Variant stores a pointer to the chosen function. There is a 1:1 mapping
1729 /// between a given VF and the chosen vectorized variant, so there will be a
1730 /// different VPlan for each VF with a valid variant.
1731 Function *Variant; 1732 1733 public: 1734 VPWidenCallRecipe(Value *UV, Function *Variant, 1735 ArrayRef<VPValue *> CallArguments, DebugLoc DL = {}) 1736 : VPRecipeWithIRFlags(VPDef::VPWidenCallSC, CallArguments, 1737 *cast<Instruction>(UV)), 1738 Variant(Variant) { 1739 assert( 1740 isa<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()) && 1741 "last operand must be the called function"); 1742 } 1743 1744 ~VPWidenCallRecipe() override = default; 1745 1746 VPWidenCallRecipe *clone() override { 1747 return new VPWidenCallRecipe(getUnderlyingValue(), Variant, 1748 {op_begin(), op_end()}, getDebugLoc()); 1749 } 1750 1751 VP_CLASSOF_IMPL(VPDef::VPWidenCallSC) 1752 1753 /// Produce a widened version of the call instruction. 1754 void execute(VPTransformState &State) override; 1755 1756 /// Return the cost of this VPWidenCallRecipe. 1757 InstructionCost computeCost(ElementCount VF, 1758 VPCostContext &Ctx) const override; 1759 1760 Function *getCalledScalarFunction() const { 1761 return cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()); 1762 } 1763 1764 operand_range arg_operands() { 1765 return make_range(op_begin(), op_begin() + getNumOperands() - 1); 1766 } 1767 const_operand_range arg_operands() const { 1768 return make_range(op_begin(), op_begin() + getNumOperands() - 1); 1769 } 1770 1771 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1772 /// Print the recipe. 1773 void print(raw_ostream &O, const Twine &Indent, 1774 VPSlotTracker &SlotTracker) const override; 1775 #endif 1776 }; 1777 1778 /// A recipe representing a sequence of load -> update -> store as part of 1779 /// a histogram operation. This means there may be aliasing between vector 1780 /// lanes, which is handled by the llvm.experimental.vector.histogram family 1781 /// of intrinsics. The only update operations currently supported are 1782 /// 'add' and 'sub' where the other term is loop-invariant. 1783 class VPHistogramRecipe : public VPRecipeBase { 1784 /// Opcode of the update operation, currently either add or sub. 1785 unsigned Opcode; 1786 1787 public: 1788 template <typename IterT> 1789 VPHistogramRecipe(unsigned Opcode, iterator_range<IterT> Operands, 1790 DebugLoc DL = {}) 1791 : VPRecipeBase(VPDef::VPHistogramSC, Operands, DL), Opcode(Opcode) {} 1792 1793 ~VPHistogramRecipe() override = default; 1794 1795 VPHistogramRecipe *clone() override { 1796 return new VPHistogramRecipe(Opcode, operands(), getDebugLoc()); 1797 } 1798 1799 VP_CLASSOF_IMPL(VPDef::VPHistogramSC); 1800 1801 /// Produce a vectorized histogram operation. 1802 void execute(VPTransformState &State) override; 1803 1804 /// Return the cost of this VPHistogramRecipe. 1805 InstructionCost computeCost(ElementCount VF, 1806 VPCostContext &Ctx) const override; 1807 1808 unsigned getOpcode() const { return Opcode; } 1809 1810 /// Return the mask operand if one was provided, or a null pointer if all 1811 /// lanes should be executed unconditionally. 1812 VPValue *getMask() const { 1813 return getNumOperands() == 3 ? getOperand(2) : nullptr; 1814 } 1815 1816 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1817 /// Print the recipe 1818 void print(raw_ostream &O, const Twine &Indent, 1819 VPSlotTracker &SlotTracker) const override; 1820 #endif 1821 }; 1822 1823 /// A recipe for widening select instructions. 
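/// For illustration only (a sketch, not the exact IR the recipe emits):
/// assuming VF = 4, a scalar select
///   %r = select i1 %c, i32 %x, i32 %y
/// is conceptually widened into
///   %wide.r = select <4 x i1> %wide.c, <4 x i32> %wide.x, <4 x i32> %wide.y
/// where the condition stays scalar if it is loop-invariant (see
/// isInvariantCond below).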
1824 struct VPWidenSelectRecipe : public VPRecipeWithIRFlags { 1825 template <typename IterT> 1826 VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands) 1827 : VPRecipeWithIRFlags(VPDef::VPWidenSelectSC, Operands, I) {} 1828 1829 ~VPWidenSelectRecipe() override = default; 1830 1831 VPWidenSelectRecipe *clone() override { 1832 return new VPWidenSelectRecipe(*cast<SelectInst>(getUnderlyingInstr()), 1833 operands()); 1834 } 1835 1836 VP_CLASSOF_IMPL(VPDef::VPWidenSelectSC) 1837 1838 /// Produce a widened version of the select instruction. 1839 void execute(VPTransformState &State) override; 1840 1841 /// Return the cost of this VPWidenSelectRecipe. 1842 InstructionCost computeCost(ElementCount VF, 1843 VPCostContext &Ctx) const override; 1844 1845 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1846 /// Print the recipe. 1847 void print(raw_ostream &O, const Twine &Indent, 1848 VPSlotTracker &SlotTracker) const override; 1849 #endif 1850 1851 VPValue *getCond() const { 1852 return getOperand(0); 1853 } 1854 1855 bool isInvariantCond() const { 1856 return getCond()->isDefinedOutsideLoopRegions(); 1857 } 1858 }; 1859 1860 /// A recipe for handling GEP instructions. 1861 class VPWidenGEPRecipe : public VPRecipeWithIRFlags { 1862 bool isPointerLoopInvariant() const { 1863 return getOperand(0)->isDefinedOutsideLoopRegions(); 1864 } 1865 1866 bool isIndexLoopInvariant(unsigned I) const { 1867 return getOperand(I + 1)->isDefinedOutsideLoopRegions(); 1868 } 1869 1870 bool areAllOperandsInvariant() const { 1871 return all_of(operands(), [](VPValue *Op) { 1872 return Op->isDefinedOutsideLoopRegions(); 1873 }); 1874 } 1875 1876 public: 1877 template <typename IterT> 1878 VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands) 1879 : VPRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, *GEP) {} 1880 1881 ~VPWidenGEPRecipe() override = default; 1882 1883 VPWidenGEPRecipe *clone() override { 1884 return new VPWidenGEPRecipe(cast<GetElementPtrInst>(getUnderlyingInstr()), 1885 operands()); 1886 } 1887 1888 VP_CLASSOF_IMPL(VPDef::VPWidenGEPSC) 1889 1890 /// Generate the gep nodes. 1891 void execute(VPTransformState &State) override; 1892 1893 /// Return the cost of this VPWidenGEPRecipe. 1894 InstructionCost computeCost(ElementCount VF, 1895 VPCostContext &Ctx) const override { 1896 // TODO: Compute accurate cost after retiring the legacy cost model. 1897 return 0; 1898 } 1899 1900 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1901 /// Print the recipe. 1902 void print(raw_ostream &O, const Twine &Indent, 1903 VPSlotTracker &SlotTracker) const override; 1904 #endif 1905 }; 1906 1907 /// A recipe to compute the pointers for widened memory accesses of IndexTy 1908 /// in reverse order. 
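/// For illustration only (a sketch, assuming a fixed VF = 4): for a
/// consecutive access in reverse order, the recipe conceptually computes
///   %rev.ptr = getelementptr IndexTy, ptr %ptr, i64 -3
/// i.e. the base pointer is moved back by VF - 1 elements so that a single
/// wide access covers lanes VF-1 down to 0; for scalable vectors the offset
/// is instead a runtime expression based on the VF operand.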
1909 class VPReverseVectorPointerRecipe : public VPRecipeWithIRFlags,
1910 public VPUnrollPartAccessor<2> {
1911 Type *IndexedTy;
1912
1913 public:
1914 VPReverseVectorPointerRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy,
1915 GEPNoWrapFlags GEPFlags, DebugLoc DL)
1916 : VPRecipeWithIRFlags(VPDef::VPReverseVectorPointerSC,
1917 ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
1918 IndexedTy(IndexedTy) {}
1919
1920 VP_CLASSOF_IMPL(VPDef::VPReverseVectorPointerSC)
1921
1922 VPValue *getVFValue() { return getOperand(1); }
1923 const VPValue *getVFValue() const { return getOperand(1); }
1924
1925 void execute(VPTransformState &State) override;
1926
1927 bool onlyFirstLaneUsed(const VPValue *Op) const override {
1928 assert(is_contained(operands(), Op) &&
1929 "Op must be an operand of the recipe");
1930 return true;
1931 }
1932
1933 /// Return the cost of this VPReverseVectorPointerRecipe.
1934 InstructionCost computeCost(ElementCount VF,
1935 VPCostContext &Ctx) const override {
1936 // TODO: Compute accurate cost after retiring the legacy cost model.
1937 return 0;
1938 }
1939
1940 /// Returns true if the recipe only uses the first part of operand \p Op.
1941 bool onlyFirstPartUsed(const VPValue *Op) const override {
1942 assert(is_contained(operands(), Op) &&
1943 "Op must be an operand of the recipe");
1944 assert(getNumOperands() <= 2 && "must have at most two operands");
1945 return true;
1946 }
1947
1948 VPReverseVectorPointerRecipe *clone() override {
1949 return new VPReverseVectorPointerRecipe(getOperand(0), getVFValue(),
1950 IndexedTy, getGEPNoWrapFlags(),
1951 getDebugLoc());
1952 }
1953
1954 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1955 /// Print the recipe.
1956 void print(raw_ostream &O, const Twine &Indent,
1957 VPSlotTracker &SlotTracker) const override;
1958 #endif
1959 };
1960
1961 /// A recipe to compute the pointers for widened memory accesses of IndexTy.
1962 class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
1963 public VPUnrollPartAccessor<1> {
1964 Type *IndexedTy;
1965
1966 public:
1967 VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, GEPNoWrapFlags GEPFlags,
1968 DebugLoc DL)
1969 : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
1970 GEPFlags, DL),
1971 IndexedTy(IndexedTy) {}
1972
1973 VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
1974
1975 void execute(VPTransformState &State) override;
1976
1977 bool onlyFirstLaneUsed(const VPValue *Op) const override {
1978 assert(is_contained(operands(), Op) &&
1979 "Op must be an operand of the recipe");
1980 return true;
1981 }
1982
1983 /// Returns true if the recipe only uses the first part of operand \p Op.
1984 bool onlyFirstPartUsed(const VPValue *Op) const override {
1985 assert(is_contained(operands(), Op) &&
1986 "Op must be an operand of the recipe");
1987 assert(getNumOperands() <= 2 && "must have at most two operands");
1988 return true;
1989 }
1990
1991 VPVectorPointerRecipe *clone() override {
1992 return new VPVectorPointerRecipe(getOperand(0), IndexedTy,
1993 getGEPNoWrapFlags(), getDebugLoc());
1994 }
1995
1996 /// Return the cost of this VPVectorPointerRecipe.
1997 InstructionCost computeCost(ElementCount VF,
1998 VPCostContext &Ctx) const override {
1999 // TODO: Compute accurate cost after retiring the legacy cost model.
2000 return 0;
2001 }
2002
2003 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2004 /// Print the recipe.
2005 void print(raw_ostream &O, const Twine &Indent, 2006 VPSlotTracker &SlotTracker) const override; 2007 #endif 2008 }; 2009 2010 /// A pure virtual base class for all recipes modeling header phis, including 2011 /// phis for first order recurrences, pointer inductions and reductions. The 2012 /// start value is the first operand of the recipe and the incoming value from 2013 /// the backedge is the second operand. 2014 /// 2015 /// Inductions are modeled using the following sub-classes: 2016 /// * VPCanonicalIVPHIRecipe: Canonical scalar induction of the vector loop, 2017 /// starting at a specified value (zero for the main vector loop, the resume 2018 /// value for the epilogue vector loop) and stepping by 1. The induction 2019 /// controls exiting of the vector loop by comparing against the vector trip 2020 /// count. Produces a single scalar PHI for the induction value per 2021 /// iteration. 2022 /// * VPWidenIntOrFpInductionRecipe: Generates vector values for integer and 2023 /// floating point inductions with arbitrary start and step values. Produces 2024 /// a vector PHI per-part. 2025 /// * VPDerivedIVRecipe: Converts the canonical IV value to the corresponding 2026 /// value of an IV with different start and step values. Produces a single 2027 /// scalar value per iteration 2028 /// * VPScalarIVStepsRecipe: Generates scalar values per-lane based on a 2029 /// canonical or derived induction. 2030 /// * VPWidenPointerInductionRecipe: Generate vector and scalar values for a 2031 /// pointer induction. Produces either a vector PHI per-part or scalar values 2032 /// per-lane based on the canonical induction. 2033 class VPHeaderPHIRecipe : public VPSingleDefRecipe { 2034 protected: 2035 VPHeaderPHIRecipe(unsigned char VPDefID, Instruction *UnderlyingInstr, 2036 VPValue *Start = nullptr, DebugLoc DL = {}) 2037 : VPSingleDefRecipe(VPDefID, ArrayRef<VPValue *>(), UnderlyingInstr, DL) { 2038 if (Start) 2039 addOperand(Start); 2040 } 2041 2042 public: 2043 ~VPHeaderPHIRecipe() override = default; 2044 2045 /// Method to support type inquiry through isa, cast, and dyn_cast. 2046 static inline bool classof(const VPRecipeBase *B) { 2047 return B->getVPDefID() >= VPDef::VPFirstHeaderPHISC && 2048 B->getVPDefID() <= VPDef::VPLastHeaderPHISC; 2049 } 2050 static inline bool classof(const VPValue *V) { 2051 auto *B = V->getDefiningRecipe(); 2052 return B && B->getVPDefID() >= VPRecipeBase::VPFirstHeaderPHISC && 2053 B->getVPDefID() <= VPRecipeBase::VPLastHeaderPHISC; 2054 } 2055 2056 /// Generate the phi nodes. 2057 void execute(VPTransformState &State) override = 0; 2058 2059 /// Return the cost of this header phi recipe. 2060 InstructionCost computeCost(ElementCount VF, 2061 VPCostContext &Ctx) const override; 2062 2063 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2064 /// Print the recipe. 2065 void print(raw_ostream &O, const Twine &Indent, 2066 VPSlotTracker &SlotTracker) const override = 0; 2067 #endif 2068 2069 /// Returns the start value of the phi, if one is set. 2070 VPValue *getStartValue() { 2071 return getNumOperands() == 0 ? nullptr : getOperand(0); 2072 } 2073 VPValue *getStartValue() const { 2074 return getNumOperands() == 0 ? nullptr : getOperand(0); 2075 } 2076 2077 /// Update the start value of the recipe. 2078 void setStartValue(VPValue *V) { setOperand(0, V); } 2079 2080 /// Returns the incoming value from the loop backedge. 2081 virtual VPValue *getBackedgeValue() { 2082 return getOperand(1); 2083 } 2084 2085 /// Returns the backedge value as a recipe. 
The backedge value is guaranteed 2086 /// to be a recipe. 2087 virtual VPRecipeBase &getBackedgeRecipe() { 2088 return *getBackedgeValue()->getDefiningRecipe(); 2089 } 2090 }; 2091 2092 /// Base class for widened induction (VPWidenIntOrFpInductionRecipe and 2093 /// VPWidenPointerInductionRecipe), providing shared functionality, including 2094 /// retrieving the step value, induction descriptor and original phi node. 2095 class VPWidenInductionRecipe : public VPHeaderPHIRecipe { 2096 const InductionDescriptor &IndDesc; 2097 2098 public: 2099 VPWidenInductionRecipe(unsigned char Kind, PHINode *IV, VPValue *Start, 2100 VPValue *Step, const InductionDescriptor &IndDesc, 2101 DebugLoc DL) 2102 : VPHeaderPHIRecipe(Kind, IV, Start, DL), IndDesc(IndDesc) { 2103 addOperand(Step); 2104 } 2105 2106 static inline bool classof(const VPRecipeBase *R) { 2107 return R->getVPDefID() == VPDef::VPWidenIntOrFpInductionSC || 2108 R->getVPDefID() == VPDef::VPWidenPointerInductionSC; 2109 } 2110 2111 static inline bool classof(const VPValue *V) { 2112 auto *R = V->getDefiningRecipe(); 2113 return R && classof(R); 2114 } 2115 2116 static inline bool classof(const VPHeaderPHIRecipe *R) { 2117 return classof(static_cast<const VPRecipeBase *>(R)); 2118 } 2119 2120 virtual void execute(VPTransformState &State) override = 0; 2121 2122 /// Returns the step value of the induction. 2123 VPValue *getStepValue() { return getOperand(1); } 2124 const VPValue *getStepValue() const { return getOperand(1); } 2125 2126 PHINode *getPHINode() const { return cast<PHINode>(getUnderlyingValue()); } 2127 2128 /// Returns the induction descriptor for the recipe. 2129 const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } 2130 2131 VPValue *getBackedgeValue() override { 2132 // TODO: All operands of base recipe must exist and be at same index in 2133 // derived recipe. 2134 llvm_unreachable( 2135 "VPWidenIntOrFpInductionRecipe generates its own backedge value"); 2136 } 2137 2138 VPRecipeBase &getBackedgeRecipe() override { 2139 // TODO: All operands of base recipe must exist and be at same index in 2140 // derived recipe. 2141 llvm_unreachable( 2142 "VPWidenIntOrFpInductionRecipe generates its own backedge value"); 2143 } 2144 }; 2145 2146 /// A recipe for handling phi nodes of integer and floating-point inductions, 2147 /// producing their vector values. 
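/// For illustration only (a sketch, assuming VF = 4, an integer IV with
/// start %start and step 1): the widened phi conceptually yields
///   <%start, %start + 1, %start + 2, %start + 3>
/// in the first vector iteration, and is advanced by a splat of
/// VF * step (here 4) on each subsequent vector iteration.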
2148 class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
2149 TruncInst *Trunc;
2150
2151 public:
2152 VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
2153 VPValue *VF, const InductionDescriptor &IndDesc,
2154 DebugLoc DL)
2155 : VPWidenInductionRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start,
2156 Step, IndDesc, DL),
2157 Trunc(nullptr) {
2158 addOperand(VF);
2159 }
2160
2161 VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
2162 VPValue *VF, const InductionDescriptor &IndDesc,
2163 TruncInst *Trunc, DebugLoc DL)
2164 : VPWidenInductionRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start,
2165 Step, IndDesc, DL),
2166 Trunc(Trunc) {
2167 addOperand(VF);
2168 }
2169
2170 ~VPWidenIntOrFpInductionRecipe() override = default;
2171
2172 VPWidenIntOrFpInductionRecipe *clone() override {
2173 return new VPWidenIntOrFpInductionRecipe(
2174 getPHINode(), getStartValue(), getStepValue(), getVFValue(),
2175 getInductionDescriptor(), Trunc, getDebugLoc());
2176 }
2177
2178 VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC)
2179
2180 /// Generate the vectorized and scalarized versions of the phi node as
2181 /// needed by their users.
2182 void execute(VPTransformState &State) override;
2183
2184 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2185 /// Print the recipe.
2186 void print(raw_ostream &O, const Twine &Indent,
2187 VPSlotTracker &SlotTracker) const override;
2188 #endif
2189
2190 VPValue *getVFValue() { return getOperand(2); }
2191 const VPValue *getVFValue() const { return getOperand(2); }
2192
2193 VPValue *getSplatVFValue() {
2194 // If the recipe has been unrolled (5 operands), return the VPValue for the
2195 // induction increment.
2196 return getNumOperands() == 5 ? getOperand(3) : nullptr;
2197 }
2198
2199 /// Returns the first defined value as TruncInst, if it is one, or nullptr
2200 /// otherwise.
2201 TruncInst *getTruncInst() { return Trunc; }
2202 const TruncInst *getTruncInst() const { return Trunc; }
2203
2204 /// Returns true if the induction is canonical, i.e. starting at 0 and
2205 /// incremented by UF * VF (= the original IV is incremented by 1) and has the
2206 /// same type as the canonical induction.
2207 bool isCanonical() const;
2208
2209 /// Returns the scalar type of the induction.
2210 Type *getScalarType() const {
2211 return Trunc ? Trunc->getType() : getPHINode()->getType();
2212 }
2213
2214 /// Returns the VPValue representing the value of this induction at
2215 /// the last unrolled part, if it exists. Returns itself if unrolling did not
2216 /// take place.
2217 VPValue *getLastUnrolledPartOperand() {
2218 return getNumOperands() == 5 ? getOperand(4) : this;
2219 }
2220 };
2221
2222 class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe,
2223 public VPUnrollPartAccessor<3> {
2224 bool IsScalarAfterVectorization;
2225
2226 public:
2227 /// Create a new VPWidenPointerInductionRecipe for \p Phi with start value \p
2228 /// Start.
2229 VPWidenPointerInductionRecipe(PHINode *Phi, VPValue *Start, VPValue *Step, 2230 const InductionDescriptor &IndDesc, 2231 bool IsScalarAfterVectorization, DebugLoc DL) 2232 : VPWidenInductionRecipe(VPDef::VPWidenPointerInductionSC, Phi, Start, 2233 Step, IndDesc, DL), 2234 IsScalarAfterVectorization(IsScalarAfterVectorization) {} 2235 2236 ~VPWidenPointerInductionRecipe() override = default; 2237 2238 VPWidenPointerInductionRecipe *clone() override { 2239 return new VPWidenPointerInductionRecipe( 2240 cast<PHINode>(getUnderlyingInstr()), getOperand(0), getOperand(1), 2241 getInductionDescriptor(), IsScalarAfterVectorization, getDebugLoc()); 2242 } 2243 2244 VP_CLASSOF_IMPL(VPDef::VPWidenPointerInductionSC) 2245 2246 /// Generate vector values for the pointer induction. 2247 void execute(VPTransformState &State) override; 2248 2249 /// Returns true if only scalar values will be generated. 2250 bool onlyScalarsGenerated(bool IsScalable); 2251 2252 /// Returns the VPValue representing the value of this induction at 2253 /// the first unrolled part, if it exists. Returns itself if unrolling did not 2254 /// take place. 2255 VPValue *getFirstUnrolledPartOperand() { 2256 return getUnrollPart(*this) == 0 ? this : getOperand(2); 2257 } 2258 2259 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2260 /// Print the recipe. 2261 void print(raw_ostream &O, const Twine &Indent, 2262 VPSlotTracker &SlotTracker) const override; 2263 #endif 2264 }; 2265 2266 /// Recipe to generate a scalar PHI. Used to generate code for recipes that 2267 /// produce scalar header phis, including VPCanonicalIVPHIRecipe and 2268 /// VPEVLBasedIVPHIRecipe. 2269 class VPScalarPHIRecipe : public VPHeaderPHIRecipe { 2270 std::string Name; 2271 2272 public: 2273 VPScalarPHIRecipe(VPValue *Start, VPValue *BackedgeValue, DebugLoc DL, 2274 StringRef Name) 2275 : VPHeaderPHIRecipe(VPDef::VPScalarPHISC, nullptr, Start, DL), 2276 Name(Name.str()) { 2277 addOperand(BackedgeValue); 2278 } 2279 2280 ~VPScalarPHIRecipe() override = default; 2281 2282 VPScalarPHIRecipe *clone() override { 2283 llvm_unreachable("cloning not implemented yet"); 2284 } 2285 2286 VP_CLASSOF_IMPL(VPDef::VPScalarPHISC) 2287 2288 /// Generate the phi/select nodes. 2289 void execute(VPTransformState &State) override; 2290 2291 /// Returns true if the recipe only uses the first lane of operand \p Op. 2292 bool onlyFirstLaneUsed(const VPValue *Op) const override { 2293 assert(is_contained(operands(), Op) && 2294 "Op must be an operand of the recipe"); 2295 return true; 2296 } 2297 2298 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2299 /// Print the recipe. 2300 void print(raw_ostream &O, const Twine &Indent, 2301 VPSlotTracker &SlotTracker) const override; 2302 #endif 2303 }; 2304 2305 /// A recipe for handling phis that are widened in the vector loop. 2306 /// In the VPlan native path, all incoming VPValues & VPBasicBlock pairs are 2307 /// managed in the recipe directly. 2308 class VPWidenPHIRecipe : public VPSingleDefRecipe { 2309 /// List of incoming blocks. Only used in the VPlan native path. 2310 SmallVector<VPBasicBlock *, 2> IncomingBlocks; 2311 2312 public: 2313 /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start and 2314 /// debug location \p DL. 
2315 VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr, DebugLoc DL = {}) 2316 : VPSingleDefRecipe(VPDef::VPWidenPHISC, ArrayRef<VPValue *>(), Phi, DL) { 2317 if (Start) 2318 addOperand(Start); 2319 } 2320 2321 VPWidenPHIRecipe *clone() override { 2322 llvm_unreachable("cloning not implemented yet"); 2323 } 2324 2325 ~VPWidenPHIRecipe() override = default; 2326 2327 VP_CLASSOF_IMPL(VPDef::VPWidenPHISC) 2328 2329 /// Generate the phi/select nodes. 2330 void execute(VPTransformState &State) override; 2331 2332 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2333 /// Print the recipe. 2334 void print(raw_ostream &O, const Twine &Indent, 2335 VPSlotTracker &SlotTracker) const override; 2336 #endif 2337 2338 /// Adds a pair (\p IncomingV, \p IncomingBlock) to the phi. 2339 void addIncoming(VPValue *IncomingV, VPBasicBlock *IncomingBlock) { 2340 addOperand(IncomingV); 2341 IncomingBlocks.push_back(IncomingBlock); 2342 } 2343 2344 /// Returns the \p I th incoming VPBasicBlock. 2345 VPBasicBlock *getIncomingBlock(unsigned I) { return IncomingBlocks[I]; } 2346 2347 /// Returns the \p I th incoming VPValue. 2348 VPValue *getIncomingValue(unsigned I) { return getOperand(I); } 2349 }; 2350 2351 /// A recipe for handling first-order recurrence phis. The start value is the 2352 /// first operand of the recipe and the incoming value from the backedge is the 2353 /// second operand. 2354 struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe { 2355 VPFirstOrderRecurrencePHIRecipe(PHINode *Phi, VPValue &Start) 2356 : VPHeaderPHIRecipe(VPDef::VPFirstOrderRecurrencePHISC, Phi, &Start) {} 2357 2358 VP_CLASSOF_IMPL(VPDef::VPFirstOrderRecurrencePHISC) 2359 2360 static inline bool classof(const VPHeaderPHIRecipe *R) { 2361 return R->getVPDefID() == VPDef::VPFirstOrderRecurrencePHISC; 2362 } 2363 2364 VPFirstOrderRecurrencePHIRecipe *clone() override { 2365 return new VPFirstOrderRecurrencePHIRecipe( 2366 cast<PHINode>(getUnderlyingInstr()), *getOperand(0)); 2367 } 2368 2369 void execute(VPTransformState &State) override; 2370 2371 /// Return the cost of this first-order recurrence phi recipe. 2372 InstructionCost computeCost(ElementCount VF, 2373 VPCostContext &Ctx) const override; 2374 2375 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2376 /// Print the recipe. 2377 void print(raw_ostream &O, const Twine &Indent, 2378 VPSlotTracker &SlotTracker) const override; 2379 #endif 2380 }; 2381 2382 /// A recipe for handling reduction phis. The start value is the first operand 2383 /// of the recipe and the incoming value from the backedge is the second 2384 /// operand. 2385 class VPReductionPHIRecipe : public VPHeaderPHIRecipe, 2386 public VPUnrollPartAccessor<2> { 2387 /// Descriptor for the reduction. 2388 const RecurrenceDescriptor &RdxDesc; 2389 2390 /// The phi is part of an in-loop reduction. 2391 bool IsInLoop; 2392 2393 /// The phi is part of an ordered reduction. Requires IsInLoop to be true. 2394 bool IsOrdered; 2395 2396 /// When expanding the reduction PHI, the plan's VF element count is divided 2397 /// by this factor to form the reduction phi's VF. 2398 unsigned VFScaleFactor = 1; 2399 2400 public: 2401 /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p 2402 /// RdxDesc. 
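/// For illustration only (a sketch, assuming VF = 4 and an unordered
/// integer add reduction): the generated IR is roughly
///   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ],
///                            [ %partial.rdx, %vector.body ]
/// with the partial sums reduced to a scalar after the loop, e.g. via
///   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %partial.rdx)
/// The neutral start value depends on the recurrence kind.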
2403 VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc, 2404 VPValue &Start, bool IsInLoop = false, 2405 bool IsOrdered = false, unsigned VFScaleFactor = 1) 2406 : VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start), 2407 RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered), 2408 VFScaleFactor(VFScaleFactor) { 2409 assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop"); 2410 } 2411 2412 ~VPReductionPHIRecipe() override = default; 2413 2414 VPReductionPHIRecipe *clone() override { 2415 auto *R = new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()), 2416 RdxDesc, *getOperand(0), IsInLoop, 2417 IsOrdered, VFScaleFactor); 2418 R->addOperand(getBackedgeValue()); 2419 return R; 2420 } 2421 2422 VP_CLASSOF_IMPL(VPDef::VPReductionPHISC) 2423 2424 static inline bool classof(const VPHeaderPHIRecipe *R) { 2425 return R->getVPDefID() == VPDef::VPReductionPHISC; 2426 } 2427 2428 /// Generate the phi/select nodes. 2429 void execute(VPTransformState &State) override; 2430 2431 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2432 /// Print the recipe. 2433 void print(raw_ostream &O, const Twine &Indent, 2434 VPSlotTracker &SlotTracker) const override; 2435 #endif 2436 2437 const RecurrenceDescriptor &getRecurrenceDescriptor() const { 2438 return RdxDesc; 2439 } 2440 2441 /// Returns true, if the phi is part of an ordered reduction. 2442 bool isOrdered() const { return IsOrdered; } 2443 2444 /// Returns true, if the phi is part of an in-loop reduction. 2445 bool isInLoop() const { return IsInLoop; } 2446 }; 2447 2448 /// A recipe for forming partial reductions. In the loop, an accumulator and 2449 /// vector operand are added together and passed to the next iteration as the 2450 /// next accumulator. After the loop body, the accumulator is reduced to a 2451 /// scalar value. 2452 class VPPartialReductionRecipe : public VPSingleDefRecipe { 2453 unsigned Opcode; 2454 2455 public: 2456 VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0, 2457 VPValue *Op1) 2458 : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1, 2459 ReductionInst) {} 2460 VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1, 2461 Instruction *ReductionInst = nullptr) 2462 : VPSingleDefRecipe(VPDef::VPPartialReductionSC, 2463 ArrayRef<VPValue *>({Op0, Op1}), ReductionInst), 2464 Opcode(Opcode) { 2465 [[maybe_unused]] auto *AccumulatorRecipe = 2466 getOperand(1)->getDefiningRecipe(); 2467 assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) || 2468 isa<VPPartialReductionRecipe>(AccumulatorRecipe)) && 2469 "Unexpected operand order for partial reduction recipe"); 2470 } 2471 ~VPPartialReductionRecipe() override = default; 2472 2473 VPPartialReductionRecipe *clone() override { 2474 return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1), 2475 getUnderlyingInstr()); 2476 } 2477 2478 VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC) 2479 2480 /// Generate the reduction in the loop. 2481 void execute(VPTransformState &State) override; 2482 2483 /// Return the cost of this VPPartialReductionRecipe. 2484 InstructionCost computeCost(ElementCount VF, 2485 VPCostContext &Ctx) const override; 2486 2487 /// Get the binary op's opcode. 2488 unsigned getOpcode() const { return Opcode; } 2489 2490 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2491 /// Print the recipe. 
2492 void print(raw_ostream &O, const Twine &Indent,
2493 VPSlotTracker &SlotTracker) const override;
2494 #endif
2495 };
2496
2497 /// A recipe for vectorizing a phi-node as a sequence of mask-based select
2498 /// instructions.
2499 class VPBlendRecipe : public VPSingleDefRecipe {
2500 public:
2501 /// The blend operation is a User of the incoming values and of their
2502 /// respective masks, ordered [I0, M0, I1, M1, I2, M2, ...]. Note that M0 can
2503 /// be omitted (implied by passing an odd number of operands) in which case
2504 /// all other incoming values are merged into it.
2505 VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands)
2506 : VPSingleDefRecipe(VPDef::VPBlendSC, Operands, Phi, Phi->getDebugLoc()) {
2507 assert(Operands.size() > 0 && "Expected at least one operand!");
2508 }
2509
2510 VPBlendRecipe *clone() override {
2511 SmallVector<VPValue *> Ops(operands());
2512 return new VPBlendRecipe(cast<PHINode>(getUnderlyingValue()), Ops);
2513 }
2514
2515 VP_CLASSOF_IMPL(VPDef::VPBlendSC)
2516
2517 /// A normalized blend is one that has an odd number of operands, whereby the
2518 /// first operand does not have an associated mask.
2519 bool isNormalized() const { return getNumOperands() % 2; }
2520
2521 /// Return the number of incoming values, taking into account when normalized
2522 /// the first incoming value will have no mask.
2523 unsigned getNumIncomingValues() const {
2524 return (getNumOperands() + isNormalized()) / 2;
2525 }
2526
2527 /// Return incoming value number \p Idx.
2528 VPValue *getIncomingValue(unsigned Idx) const {
2529 return Idx == 0 ? getOperand(0) : getOperand(Idx * 2 - isNormalized());
2530 }
2531
2532 /// Return mask number \p Idx.
2533 VPValue *getMask(unsigned Idx) const {
2534 assert((Idx > 0 || !isNormalized()) && "First index has no mask!");
2535 return Idx == 0 ? getOperand(1) : getOperand(Idx * 2 + !isNormalized());
2536 }
2537
2538 /// Generate the phi/select nodes.
2539 void execute(VPTransformState &State) override;
2540
2541 /// Return the cost of this VPBlendRecipe.
2542 InstructionCost computeCost(ElementCount VF,
2543 VPCostContext &Ctx) const override;
2544
2545 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2546 /// Print the recipe.
2547 void print(raw_ostream &O, const Twine &Indent,
2548 VPSlotTracker &SlotTracker) const override;
2549 #endif
2550
2551 /// Returns true if the recipe only uses the first lane of operand \p Op.
2552 bool onlyFirstLaneUsed(const VPValue *Op) const override {
2553 assert(is_contained(operands(), Op) &&
2554 "Op must be an operand of the recipe");
2555 // Recurse through Blend recipes only; the recursion must terminate at a
2556 // header phi at the latest.
2557 return all_of(users(),
2558 [this](VPUser *U) { return U->onlyFirstLaneUsed(this); });
2559 }
2560 };
2561
2562 /// VPInterleaveRecipe is a recipe for transforming an interleave group of
2563 /// loads or stores into one wide load/store and shuffles. The first operand of
2564 /// a VPInterleave recipe is the address, followed by the stored values,
2565 /// followed by an optional mask.
2566 class VPInterleaveRecipe : public VPRecipeBase {
2567 const InterleaveGroup<Instruction> *IG;
2568
2569 /// Indicates if the interleave group is in a conditional block and requires a
2570 /// mask.
2571 bool HasMask = false;
2572
2573 /// Indicates if gaps between members of the group need to be masked out or if
2574 /// unused gaps can be loaded speculatively.
2575 bool NeedsMaskForGaps = false;
2576
2577 public:
2578 VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
2579 ArrayRef<VPValue *> StoredValues, VPValue *Mask,
2580 bool NeedsMaskForGaps)
2581 : VPRecipeBase(VPDef::VPInterleaveSC, {Addr}), IG(IG),
2582 NeedsMaskForGaps(NeedsMaskForGaps) {
2583 for (unsigned i = 0; i < IG->getFactor(); ++i)
2584 if (Instruction *I = IG->getMember(i)) {
2585 if (I->getType()->isVoidTy())
2586 continue;
2587 new VPValue(I, this);
2588 }
2589
2590 for (auto *SV : StoredValues)
2591 addOperand(SV);
2592 if (Mask) {
2593 HasMask = true;
2594 addOperand(Mask);
2595 }
2596 }
2597 ~VPInterleaveRecipe() override = default;
2598
2599 VPInterleaveRecipe *clone() override {
2600 return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(),
2601 NeedsMaskForGaps);
2602 }
2603
2604 VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
2605
2606 /// Return the address accessed by this recipe.
2607 VPValue *getAddr() const {
2608 return getOperand(0); // Address is the 1st, mandatory operand.
2609 }
2610
2611 /// Return the mask used by this recipe. Note that a full mask is represented
2612 /// by a nullptr.
2613 VPValue *getMask() const {
2614 // Mask is optional and therefore the last, currently 2nd operand.
2615 return HasMask ? getOperand(getNumOperands() - 1) : nullptr;
2616 }
2617
2618 /// Return the VPValues stored by this interleave group. If it is a load
2619 /// interleave group, return an empty ArrayRef.
2620 ArrayRef<VPValue *> getStoredValues() const {
2621 // The first operand is the address, followed by the stored values, followed
2622 // by an optional mask.
2623 return ArrayRef<VPValue *>(op_begin(), getNumOperands())
2624 .slice(1, getNumStoreOperands());
2625 }
2626
2627 /// Generate the wide load or store, and shuffles.
2628 void execute(VPTransformState &State) override;
2629
2630 /// Return the cost of this VPInterleaveRecipe.
2631 InstructionCost computeCost(ElementCount VF,
2632 VPCostContext &Ctx) const override;
2633
2634 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2635 /// Print the recipe.
2636 void print(raw_ostream &O, const Twine &Indent,
2637 VPSlotTracker &SlotTracker) const override;
2638 #endif
2639
2640 const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
2641
2642 /// Returns the number of stored operands of this interleave group. Returns 0
2643 /// for load interleave groups.
2644 unsigned getNumStoreOperands() const {
2645 return getNumOperands() - (HasMask ? 2 : 1);
2646 }
2647
2648 /// The recipe only uses the first lane of the address.
2649 bool onlyFirstLaneUsed(const VPValue *Op) const override {
2650 assert(is_contained(operands(), Op) &&
2651 "Op must be an operand of the recipe");
2652 return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
2653 }
2654
2655 Instruction *getInsertPos() const { return IG->getInsertPos(); }
2656 };
2657
2658 /// A recipe to represent in-loop reduction operations, performing a reduction
2659 /// on a vector operand into a scalar value, and adding the result to a chain.
2660 /// The Operands are {ChainOp, VecOp, [Condition]}.
2661 class VPReductionRecipe : public VPSingleDefRecipe {
2662 /// The recurrence descriptor for the reduction in question.
2663 const RecurrenceDescriptor &RdxDesc;
2664 bool IsOrdered;
2665 /// Whether the reduction is conditional.
2666 bool IsConditional = false;
2667
2668 protected:
2669 VPReductionRecipe(const unsigned char SC, const RecurrenceDescriptor &R,
2670 Instruction *I, ArrayRef<VPValue *> Operands,
2671 VPValue *CondOp, bool IsOrdered, DebugLoc DL)
2672 : VPSingleDefRecipe(SC, Operands, I, DL), RdxDesc(R),
2673 IsOrdered(IsOrdered) {
2674 if (CondOp) {
2675 IsConditional = true;
2676 addOperand(CondOp);
2677 }
2678 }
2679
2680 public:
2681 VPReductionRecipe(const RecurrenceDescriptor &R, Instruction *I,
2682 VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
2683 bool IsOrdered, DebugLoc DL = {})
2684 : VPReductionRecipe(VPDef::VPReductionSC, R, I,
2685 ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
2686 IsOrdered, DL) {}
2687
2688 ~VPReductionRecipe() override = default;
2689
2690 VPReductionRecipe *clone() override {
2691 return new VPReductionRecipe(RdxDesc, getUnderlyingInstr(), getChainOp(),
2692 getVecOp(), getCondOp(), IsOrdered,
2693 getDebugLoc());
2694 }
2695
2696 static inline bool classof(const VPRecipeBase *R) {
2697 return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
2698 R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
2699 }
2700
2701 static inline bool classof(const VPUser *U) {
2702 auto *R = dyn_cast<VPRecipeBase>(U);
2703 return R && classof(R);
2704 }
2705
2706 /// Generate the reduction in the loop.
2707 void execute(VPTransformState &State) override;
2708
2709 /// Return the cost of VPReductionRecipe.
2710 InstructionCost computeCost(ElementCount VF,
2711 VPCostContext &Ctx) const override;
2712
2713 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2714 /// Print the recipe.
2715 void print(raw_ostream &O, const Twine &Indent,
2716 VPSlotTracker &SlotTracker) const override;
2717 #endif
2718
2719 /// Return the recurrence descriptor for the in-loop reduction.
2720 const RecurrenceDescriptor &getRecurrenceDescriptor() const {
2721 return RdxDesc;
2722 }
2723 /// Return true if the in-loop reduction is ordered.
2724 bool isOrdered() const { return IsOrdered; };
2725 /// Return true if the in-loop reduction is conditional.
2726 bool isConditional() const { return IsConditional; };
2727 /// The VPValue of the scalar Chain being accumulated.
2728 VPValue *getChainOp() const { return getOperand(0); }
2729 /// The VPValue of the vector value to be reduced.
2730 VPValue *getVecOp() const { return getOperand(1); }
2731 /// The VPValue of the condition for the block.
2732 VPValue *getCondOp() const {
2733 return isConditional() ? getOperand(getNumOperands() - 1) : nullptr;
2734 }
2735 };
2736
2737 /// A recipe to represent in-loop reduction operations with vector-predication
2738 /// intrinsics, performing a reduction on a vector operand with the explicit
2739 /// vector length (EVL) into a scalar value, and adding the result to a chain.
2740 /// The Operands are {ChainOp, VecOp, EVL, [Condition]}.
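/// For illustration only (a sketch, assuming an integer add reduction with
/// VF = 4): the in-loop reduction is conceptually emitted as
///   %rdx = call i32 @llvm.vp.reduce.add.v4i32(i32 %chain, <4 x i32> %vec,
///                                             <4 x i1> %mask, i32 %evl)
/// where the actual intrinsic and types depend on the recurrence kind and
/// the chosen VF.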
2741 class VPReductionEVLRecipe : public VPReductionRecipe {
2742 public:
2743 VPReductionEVLRecipe(VPReductionRecipe &R, VPValue &EVL, VPValue *CondOp)
2744 : VPReductionRecipe(
2745 VPDef::VPReductionEVLSC, R.getRecurrenceDescriptor(),
2746 cast_or_null<Instruction>(R.getUnderlyingValue()),
2747 ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp,
2748 R.isOrdered(), R.getDebugLoc()) {}
2749
2750 ~VPReductionEVLRecipe() override = default;
2751
2752 VPReductionEVLRecipe *clone() override {
2753 llvm_unreachable("cloning not implemented yet");
2754 }
2755
2756 VP_CLASSOF_IMPL(VPDef::VPReductionEVLSC)
2757
2758 /// Generate the reduction in the loop.
2759 void execute(VPTransformState &State) override;
2760
2761 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2762 /// Print the recipe.
2763 void print(raw_ostream &O, const Twine &Indent,
2764 VPSlotTracker &SlotTracker) const override;
2765 #endif
2766
2767 /// The VPValue of the explicit vector length.
2768 VPValue *getEVL() const { return getOperand(2); }
2769
2770 /// Returns true if the recipe only uses the first lane of operand \p Op.
2771 bool onlyFirstLaneUsed(const VPValue *Op) const override {
2772 assert(is_contained(operands(), Op) &&
2773 "Op must be an operand of the recipe");
2774 return Op == getEVL();
2775 }
2776 };
2777
2778 /// VPReplicateRecipe replicates a given instruction producing multiple scalar
2779 /// copies of the original scalar type, one per lane, instead of producing a
2780 /// single copy of widened type for all lanes. If the instruction is known to
2781 /// be uniform, only one copy, for lane zero, will be generated.
2782 class VPReplicateRecipe : public VPRecipeWithIRFlags {
2783 /// Indicator if only a single replica per lane is needed.
2784 bool IsUniform;
2785
2786 /// Indicator if the replicas are also predicated.
2787 bool IsPredicated;
2788
2789 public:
2790 template <typename IterT>
2791 VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands,
2792 bool IsUniform, VPValue *Mask = nullptr)
2793 : VPRecipeWithIRFlags(VPDef::VPReplicateSC, Operands, *I),
2794 IsUniform(IsUniform), IsPredicated(Mask) {
2795 if (Mask)
2796 addOperand(Mask);
2797 }
2798
2799 ~VPReplicateRecipe() override = default;
2800
2801 VPReplicateRecipe *clone() override {
2802 auto *Copy =
2803 new VPReplicateRecipe(getUnderlyingInstr(), operands(), IsUniform,
2804 isPredicated() ? getMask() : nullptr);
2805 Copy->transferFlags(*this);
2806 return Copy;
2807 }
2808
2809 VP_CLASSOF_IMPL(VPDef::VPReplicateSC)
2810
2811 /// Generate replicas of the desired Ingredient. Replicas will be generated
2812 /// for all parts and lanes unless a specific part and lane are specified in
2813 /// the \p State.
2814 void execute(VPTransformState &State) override;
2815
2816 /// Return the cost of this VPReplicateRecipe.
2817 InstructionCost computeCost(ElementCount VF,
2818 VPCostContext &Ctx) const override;
2819
2820 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2821 /// Print the recipe.
2822 void print(raw_ostream &O, const Twine &Indent,
2823 VPSlotTracker &SlotTracker) const override;
2824 #endif
2825
2826 bool isUniform() const { return IsUniform; }
2827
2828 bool isPredicated() const { return IsPredicated; }
2829
2830 /// Returns true if the recipe only uses the first lane of operand \p Op.
2831 bool onlyFirstLaneUsed(const VPValue *Op) const override {
2832 assert(is_contained(operands(), Op) &&
2833 "Op must be an operand of the recipe");
2834 return isUniform();
2835 }
2836
2837 /// Returns true if the recipe uses scalars of operand \p Op.
2838 bool usesScalars(const VPValue *Op) const override {
2839 assert(is_contained(operands(), Op) &&
2840 "Op must be an operand of the recipe");
2841 return true;
2842 }
2843
2844 /// Returns true if the recipe is used by a widened recipe via an intervening
2845 /// VPPredInstPHIRecipe. In this case, the scalar values should also be packed
2846 /// in a vector.
2847 bool shouldPack() const;
2848
2849 /// Return the mask of a predicated VPReplicateRecipe.
2850 VPValue *getMask() {
2851 assert(isPredicated() && "Trying to get the mask of an unpredicated recipe");
2852 return getOperand(getNumOperands() - 1);
2853 }
2854
2855 unsigned getOpcode() const { return getUnderlyingInstr()->getOpcode(); }
2856 };
2857
2858 /// A recipe for generating conditional branches on the bits of a mask.
2859 class VPBranchOnMaskRecipe : public VPRecipeBase {
2860 public:
2861 VPBranchOnMaskRecipe(VPValue *BlockInMask)
2862 : VPRecipeBase(VPDef::VPBranchOnMaskSC, {}) {
2863 if (BlockInMask) // nullptr means all-one mask.
2864 addOperand(BlockInMask);
2865 }
2866
2867 VPBranchOnMaskRecipe *clone() override {
2868 return new VPBranchOnMaskRecipe(getOperand(0));
2869 }
2870
2871 VP_CLASSOF_IMPL(VPDef::VPBranchOnMaskSC)
2872
2873 /// Generate the extraction of the appropriate bit from the block mask and the
2874 /// conditional branch.
2875 void execute(VPTransformState &State) override;
2876
2877 /// Return the cost of this VPBranchOnMaskRecipe.
2878 InstructionCost computeCost(ElementCount VF,
2879 VPCostContext &Ctx) const override;
2880
2881 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2882 /// Print the recipe.
2883 void print(raw_ostream &O, const Twine &Indent,
2884 VPSlotTracker &SlotTracker) const override {
2885 O << Indent << "BRANCH-ON-MASK ";
2886 if (VPValue *Mask = getMask())
2887 Mask->printAsOperand(O, SlotTracker);
2888 else
2889 O << " All-One";
2890 }
2891 #endif
2892
2893 /// Return the mask used by this recipe. Note that a full mask is represented
2894 /// by a nullptr.
2895 VPValue *getMask() const {
2896 assert(getNumOperands() <= 1 && "should have either 0 or 1 operands");
2897 // Mask is optional.
2898 return getNumOperands() == 1 ? getOperand(0) : nullptr;
2899 }
2900
2901 /// Returns true if the recipe uses scalars of operand \p Op.
2902 bool usesScalars(const VPValue *Op) const override {
2903 assert(is_contained(operands(), Op) &&
2904 "Op must be an operand of the recipe");
2905 return true;
2906 }
2907 };
2908
2909 /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
2910 /// control converges back from a Branch-on-Mask. The phi nodes are needed in
2911 /// order to merge values that are set under such a branch and feed their uses.
2912 /// The phi nodes can be scalar or vector depending on the users of the value.
2913 /// This recipe works in concert with VPBranchOnMaskRecipe.
2914 class VPPredInstPHIRecipe : public VPSingleDefRecipe {
2915 public:
2916 /// Construct a VPPredInstPHIRecipe given \p PredV whose value needs phi
2917 /// nodes after merging back from a Branch-on-Mask.
2918 VPPredInstPHIRecipe(VPValue *PredV, DebugLoc DL) 2919 : VPSingleDefRecipe(VPDef::VPPredInstPHISC, PredV, DL) {} 2920 ~VPPredInstPHIRecipe() override = default; 2921 2922 VPPredInstPHIRecipe *clone() override { 2923 return new VPPredInstPHIRecipe(getOperand(0), getDebugLoc()); 2924 } 2925 2926 VP_CLASSOF_IMPL(VPDef::VPPredInstPHISC) 2927 2928 /// Generates phi nodes for live-outs (from a replicate region) as needed to 2929 /// retain SSA form. 2930 void execute(VPTransformState &State) override; 2931 2932 /// Return the cost of this VPPredInstPHIRecipe. 2933 InstructionCost computeCost(ElementCount VF, 2934 VPCostContext &Ctx) const override { 2935 // TODO: Compute accurate cost after retiring the legacy cost model. 2936 return 0; 2937 } 2938 2939 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2940 /// Print the recipe. 2941 void print(raw_ostream &O, const Twine &Indent, 2942 VPSlotTracker &SlotTracker) const override; 2943 #endif 2944 2945 /// Returns true if the recipe uses scalars of operand \p Op. 2946 bool usesScalars(const VPValue *Op) const override { 2947 assert(is_contained(operands(), Op) && 2948 "Op must be an operand of the recipe"); 2949 return true; 2950 } 2951 }; 2952 2953 /// A common base class for widening memory operations. An optional mask can be 2954 /// provided as the last operand. 2955 class VPWidenMemoryRecipe : public VPRecipeBase { 2956 protected: 2957 Instruction &Ingredient; 2958 2959 /// Whether the accessed addresses are consecutive. 2960 bool Consecutive; 2961 2962 /// Whether the consecutive accessed addresses are in reverse order. 2963 bool Reverse; 2964 2965 /// Whether the memory access is masked. 2966 bool IsMasked = false; 2967 2968 void setMask(VPValue *Mask) { 2969 assert(!IsMasked && "cannot re-set mask"); 2970 if (!Mask) 2971 return; 2972 addOperand(Mask); 2973 IsMasked = true; 2974 } 2975 2976 VPWidenMemoryRecipe(const char unsigned SC, Instruction &I, 2977 std::initializer_list<VPValue *> Operands, 2978 bool Consecutive, bool Reverse, DebugLoc DL) 2979 : VPRecipeBase(SC, Operands, DL), Ingredient(I), Consecutive(Consecutive), 2980 Reverse(Reverse) { 2981 assert((Consecutive || !Reverse) && "Reverse implies consecutive"); 2982 } 2983 2984 public: 2985 VPWidenMemoryRecipe *clone() override { 2986 llvm_unreachable("cloning not supported"); 2987 } 2988 2989 static inline bool classof(const VPRecipeBase *R) { 2990 return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC || 2991 R->getVPDefID() == VPRecipeBase::VPWidenStoreSC || 2992 R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC || 2993 R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC; 2994 } 2995 2996 static inline bool classof(const VPUser *U) { 2997 auto *R = dyn_cast<VPRecipeBase>(U); 2998 return R && classof(R); 2999 } 3000 3001 /// Return whether the loaded-from / stored-to addresses are consecutive. 3002 bool isConsecutive() const { return Consecutive; } 3003 3004 /// Return whether the consecutive loaded/stored addresses are in reverse 3005 /// order. 3006 bool isReverse() const { return Reverse; } 3007 3008 /// Return the address accessed by this recipe. 3009 VPValue *getAddr() const { return getOperand(0); } 3010 3011 /// Returns true if the recipe is masked. 3012 bool isMasked() const { return IsMasked; } 3013 3014 /// Return the mask used by this recipe. Note that a full mask is represented 3015 /// by a nullptr. 3016 VPValue *getMask() const { 3017 // Mask is optional and therefore the last operand. 3018 return isMasked() ? 
getOperand(getNumOperands() - 1) : nullptr;
3019 }
3020
3021 /// Generate the wide load/store.
3022 void execute(VPTransformState &State) override {
3023 llvm_unreachable("VPWidenMemoryRecipe should not be instantiated.");
3024 }
3025
3026 /// Return the cost of this VPWidenMemoryRecipe.
3027 InstructionCost computeCost(ElementCount VF,
3028 VPCostContext &Ctx) const override;
3029
3030 Instruction &getIngredient() const { return Ingredient; }
3031 };
3032
3033 /// A recipe for widening load operations, using the address to load from and an
3034 /// optional mask.
3035 struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
3036 VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
3037 bool Consecutive, bool Reverse, DebugLoc DL)
3038 : VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
3039 Reverse, DL),
3040 VPValue(this, &Load) {
3041 setMask(Mask);
3042 }
3043
3044 VPWidenLoadRecipe *clone() override {
3045 return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
3046 getMask(), Consecutive, Reverse,
3047 getDebugLoc());
3048 }
3049
3050 VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC);
3051
3052 /// Generate a wide load or gather.
3053 void execute(VPTransformState &State) override;
3054
3055 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3056 /// Print the recipe.
3057 void print(raw_ostream &O, const Twine &Indent,
3058 VPSlotTracker &SlotTracker) const override;
3059 #endif
3060
3061 /// Returns true if the recipe only uses the first lane of operand \p Op.
3062 bool onlyFirstLaneUsed(const VPValue *Op) const override {
3063 assert(is_contained(operands(), Op) &&
3064 "Op must be an operand of the recipe");
3065 // Widened, consecutive load operations only demand the first lane of
3066 // their address.
3067 return Op == getAddr() && isConsecutive();
3068 }
3069 };
3070
3071 /// A recipe for widening load operations with vector-predication intrinsics,
3072 /// using the address to load from, the explicit vector length and an optional
3073 /// mask.
3074 struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
3075 VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask)
3076 : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
3077 {L.getAddr(), &EVL}, L.isConsecutive(),
3078 L.isReverse(), L.getDebugLoc()),
3079 VPValue(this, &getIngredient()) {
3080 setMask(Mask);
3081 }
3082
3083 VP_CLASSOF_IMPL(VPDef::VPWidenLoadEVLSC)
3084
3085 /// Return the EVL operand.
3086 VPValue *getEVL() const { return getOperand(1); }
3087
3088 /// Generate the wide load or gather.
3089 void execute(VPTransformState &State) override;
3090
3091 /// Return the cost of this VPWidenLoadEVLRecipe.
3092 InstructionCost computeCost(ElementCount VF,
3093 VPCostContext &Ctx) const override;
3094
3095 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3096 /// Print the recipe.
3097 void print(raw_ostream &O, const Twine &Indent,
3098 VPSlotTracker &SlotTracker) const override;
3099 #endif
3100
3101 /// Returns true if the recipe only uses the first lane of operand \p Op.
3102 bool onlyFirstLaneUsed(const VPValue *Op) const override {
3103 assert(is_contained(operands(), Op) &&
3104 "Op must be an operand of the recipe");
3105 // Widened loads only demand the first lane of EVL and consecutive loads
3106 // only demand the first lane of their address.
3110
3111 /// A recipe for widening store operations, using the stored value, the address
3112 /// to store to and an optional mask.
3113 struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
3114   VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
3115                      VPValue *Mask, bool Consecutive, bool Reverse, DebugLoc DL)
3116       : VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
3117                             Consecutive, Reverse, DL) {
3118     setMask(Mask);
3119   }
3120
3121   VPWidenStoreRecipe *clone() override {
3122     return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
3123                                   getStoredValue(), getMask(), Consecutive,
3124                                   Reverse, getDebugLoc());
3125   }
3126
3127   VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
3128
3129   /// Return the value stored by this recipe.
3130   VPValue *getStoredValue() const { return getOperand(1); }
3131
3132   /// Generate a wide store or scatter.
3133   void execute(VPTransformState &State) override;
3134
3135 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3136   /// Print the recipe.
3137   void print(raw_ostream &O, const Twine &Indent,
3138              VPSlotTracker &SlotTracker) const override;
3139 #endif
3140
3141   /// Returns true if the recipe only uses the first lane of operand \p Op.
3142   bool onlyFirstLaneUsed(const VPValue *Op) const override {
3143     assert(is_contained(operands(), Op) &&
3144            "Op must be an operand of the recipe");
3145     // Widened, consecutive stores only demand the first lane of their address,
3146     // unless the same operand is also stored.
3147     return Op == getAddr() && isConsecutive() && Op != getStoredValue();
3148   }
3149 };
3150
3151 /// A recipe for widening store operations with vector-predication intrinsics,
3152 /// using the value to store, the address to store to, the explicit vector
3153 /// length and an optional mask.
3154 struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
3155   VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask)
3156       : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
3157                             {S.getAddr(), S.getStoredValue(), &EVL},
3158                             S.isConsecutive(), S.isReverse(), S.getDebugLoc()) {
3159     setMask(Mask);
3160   }
3161
3162   VP_CLASSOF_IMPL(VPDef::VPWidenStoreEVLSC)
3163
3164   /// Return the value stored by this recipe.
3165   VPValue *getStoredValue() const { return getOperand(1); }
3166
3167   /// Return the EVL operand.
3168   VPValue *getEVL() const { return getOperand(2); }
3169
3170   /// Generate the wide store or scatter.
3171   void execute(VPTransformState &State) override;
3172
3173   /// Return the cost of this VPWidenStoreEVLRecipe.
3174   InstructionCost computeCost(ElementCount VF,
3175                               VPCostContext &Ctx) const override;
3176
3177 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3178   /// Print the recipe.
3179   void print(raw_ostream &O, const Twine &Indent,
3180              VPSlotTracker &SlotTracker) const override;
3181 #endif
3182
3183   /// Returns true if the recipe only uses the first lane of operand \p Op.
3184   bool onlyFirstLaneUsed(const VPValue *Op) const override {
3185     assert(is_contained(operands(), Op) &&
3186            "Op must be an operand of the recipe");
3187     if (Op == getEVL()) {
3188       assert(getStoredValue() != Op && "unexpected store of EVL");
3189       return true;
3190     }
3191     // Widened, consecutive memory operations only demand the first lane of
3192     // their address, unless the same operand is also stored. The latter can
3193     // happen with opaque pointers.
3194     return Op == getAddr() && isConsecutive() && Op != getStoredValue();
3195   }
3196 };
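// --- Illustrative sketch (editorial addition, not part of upstream VPlan.h):
// the demanded-lanes logic above in action. For a consecutive store only
// lane 0 of the address feeds the final pointer, but if the same VPValue is
// also the stored data, all of its lanes are demanded. Hypothetical helper:
inline bool exampleStoreNeedsOnlyFirstAddressLane(const VPWidenStoreRecipe &S) {
  // False for gathers/scatters and for stores whose address is also the
  // stored value.
  return S.onlyFirstLaneUsed(S.getAddr());
}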
3197
3198 /// Recipe to expand a SCEV expression.
3199 class VPExpandSCEVRecipe : public VPSingleDefRecipe {
3200   const SCEV *Expr;
3201   ScalarEvolution &SE;
3202
3203 public:
3204   VPExpandSCEVRecipe(const SCEV *Expr, ScalarEvolution &SE)
3205       : VPSingleDefRecipe(VPDef::VPExpandSCEVSC, {}), Expr(Expr), SE(SE) {}
3206
3207   ~VPExpandSCEVRecipe() override = default;
3208
3209   VPExpandSCEVRecipe *clone() override {
3210     return new VPExpandSCEVRecipe(Expr, SE);
3211   }
3212
3213   VP_CLASSOF_IMPL(VPDef::VPExpandSCEVSC)
3214
3215   /// Generate the code to compute the expanded SCEV expression's value.
3216   void execute(VPTransformState &State) override;
3217
3218   /// Return the cost of this VPExpandSCEVRecipe.
3219   InstructionCost computeCost(ElementCount VF,
3220                               VPCostContext &Ctx) const override {
3221     // TODO: Compute accurate cost after retiring the legacy cost model.
3222     return 0;
3223   }
3224
3225 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3226   /// Print the recipe.
3227   void print(raw_ostream &O, const Twine &Indent,
3228              VPSlotTracker &SlotTracker) const override;
3229 #endif
3230
3231   const SCEV *getSCEV() const { return Expr; }
3232 };
3233
3234 /// Canonical scalar induction phi of the vector loop, starting at the specified
3235 /// start value (either 0 or the resume value when vectorizing the epilogue
3236 /// loop). VPWidenCanonicalIVRecipe represents the vector version of the
3237 /// canonical induction variable.
3238 class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
3239 public:
3240   VPCanonicalIVPHIRecipe(VPValue *StartV, DebugLoc DL)
3241       : VPHeaderPHIRecipe(VPDef::VPCanonicalIVPHISC, nullptr, StartV, DL) {}
3242
3243   ~VPCanonicalIVPHIRecipe() override = default;
3244
3245   VPCanonicalIVPHIRecipe *clone() override {
3246     auto *R = new VPCanonicalIVPHIRecipe(getOperand(0), getDebugLoc());
3247     R->addOperand(getBackedgeValue());
3248     return R;
3249   }
3250
3251   VP_CLASSOF_IMPL(VPDef::VPCanonicalIVPHISC)
3252
3253   static inline bool classof(const VPHeaderPHIRecipe *D) {
3254     return D->getVPDefID() == VPDef::VPCanonicalIVPHISC;
3255   }
3256
3257   void execute(VPTransformState &State) override {
3258     llvm_unreachable(
3259         "cannot execute this recipe, should be replaced by VPScalarPHIRecipe");
3260   }
3261
3262 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3263   /// Print the recipe.
3264   void print(raw_ostream &O, const Twine &Indent,
3265              VPSlotTracker &SlotTracker) const override;
3266 #endif
3267
3268   /// Returns the scalar type of the induction.
3269   Type *getScalarType() const {
3270     return getStartValue()->getLiveInIRValue()->getType();
3271   }
3272
3273   /// Returns true if the recipe only uses the first lane of operand \p Op.
3274   bool onlyFirstLaneUsed(const VPValue *Op) const override {
3275     assert(is_contained(operands(), Op) &&
3276            "Op must be an operand of the recipe");
3277     return true;
3278   }
3279
3280   /// Returns true if the recipe only uses the first part of operand \p Op.
3281   bool onlyFirstPartUsed(const VPValue *Op) const override {
3282     assert(is_contained(operands(), Op) &&
3283            "Op must be an operand of the recipe");
3284     return true;
3285   }
3286
3287   /// Return the cost of this VPCanonicalIVPHIRecipe.
3288   InstructionCost computeCost(ElementCount VF,
3289                               VPCostContext &Ctx) const override {
3290     // For now, match the behavior of the legacy cost model.
3291     return 0;
3292   }
3293 };
3294
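// --- Illustrative sketch (editorial addition, not part of upstream VPlan.h):
// recovering the induction type from the canonical IV. The start value is a
// live-in IR value (e.g. i64 0 for the main vector loop), so its IR type
// determines the scalar type of the induction. Hypothetical helper:
inline Type *exampleCanonicalIVType(const VPCanonicalIVPHIRecipe &CanIV) {
  // Equivalent to CanIV.getScalarType().
  return CanIV.getStartValue()->getLiveInIRValue()->getType();
}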
3295 /// A recipe for generating the active lane mask for the vector loop that is
3296 /// used to predicate the vector operations.
3297 /// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
3298 /// remove VPActiveLaneMaskPHIRecipe.
3299 class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
3300 public:
3301   VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL)
3302       : VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask,
3303                           DL) {}
3304
3305   ~VPActiveLaneMaskPHIRecipe() override = default;
3306
3307   VPActiveLaneMaskPHIRecipe *clone() override {
3308     auto *R = new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
3309     if (getNumOperands() == 2)
3310       R->addOperand(getOperand(1));
3311     return R;
3312   }
3313
3314   VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)
3315
3316   static inline bool classof(const VPHeaderPHIRecipe *D) {
3317     return D->getVPDefID() == VPDef::VPActiveLaneMaskPHISC;
3318   }
3319
3320   /// Generate the active lane mask phi of the vector loop.
3321   void execute(VPTransformState &State) override;
3322
3323 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3324   /// Print the recipe.
3325   void print(raw_ostream &O, const Twine &Indent,
3326              VPSlotTracker &SlotTracker) const override;
3327 #endif
3328 };
3329
3330 /// A recipe for generating the phi node for the current index of elements,
3331 /// adjusted in accordance with the EVL value. It starts at the start value of
3332 /// the canonical induction and gets incremented by EVL in each iteration of
3333 /// the vector loop.
3334 class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe {
3335 public:
3336   VPEVLBasedIVPHIRecipe(VPValue *StartIV, DebugLoc DL)
3337       : VPHeaderPHIRecipe(VPDef::VPEVLBasedIVPHISC, nullptr, StartIV, DL) {}
3338
3339   ~VPEVLBasedIVPHIRecipe() override = default;
3340
3341   VPEVLBasedIVPHIRecipe *clone() override {
3342     llvm_unreachable("cloning not implemented yet");
3343   }
3344
3345   VP_CLASSOF_IMPL(VPDef::VPEVLBasedIVPHISC)
3346
3347   static inline bool classof(const VPHeaderPHIRecipe *D) {
3348     return D->getVPDefID() == VPDef::VPEVLBasedIVPHISC;
3349   }
3350
3351   void execute(VPTransformState &State) override {
3352     llvm_unreachable(
3353         "cannot execute this recipe, should be replaced by VPScalarPHIRecipe");
3354   }
3355
3356   /// Return the cost of this VPEVLBasedIVPHIRecipe.
3357   InstructionCost computeCost(ElementCount VF,
3358                               VPCostContext &Ctx) const override {
3359     // For now, match the behavior of the legacy cost model.
3360     return 0;
3361   }
3362
3363   /// Returns true if the recipe only uses the first lane of operand \p Op.
3364   bool onlyFirstLaneUsed(const VPValue *Op) const override {
3365     assert(is_contained(operands(), Op) &&
3366            "Op must be an operand of the recipe");
3367     return true;
3368   }
3369
3370 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3371   /// Print the recipe.
3372   void print(raw_ostream &O, const Twine &Indent,
3373              VPSlotTracker &SlotTracker) const override;
3374 #endif
3375 };
3376
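// --- Illustrative sketch (editorial addition, not part of upstream VPlan.h):
// the two scalar header phis above, contrasted in pseudo-IR for one vector
// iteration (all names hypothetical):
//
//   canonical.iv      = phi [ 0, ph ], [ canonical.iv.next, latch ]
//   evl.based.iv      = phi [ 0, ph ], [ evl.based.iv.next, latch ]
//   evl               = <explicit vector length for this iteration>
//   canonical.iv.next = add canonical.iv, VF * UF
//   evl.based.iv.next = add evl.based.iv, zext(evl)
//
// i.e. the canonical IV advances by a fixed step, while the EVL-based IV
// advances by however many elements were actually processed.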
3377 /// A recipe for widening the canonical induction variable of the vector loop.
3378 class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe,
3379                                  public VPUnrollPartAccessor<1> {
3380 public:
3381   VPWidenCanonicalIVRecipe(VPCanonicalIVPHIRecipe *CanonicalIV)
3382       : VPSingleDefRecipe(VPDef::VPWidenCanonicalIVSC, {CanonicalIV}) {}
3383
3384   ~VPWidenCanonicalIVRecipe() override = default;
3385
3386   VPWidenCanonicalIVRecipe *clone() override {
3387     return new VPWidenCanonicalIVRecipe(
3388         cast<VPCanonicalIVPHIRecipe>(getOperand(0)));
3389   }
3390
3391   VP_CLASSOF_IMPL(VPDef::VPWidenCanonicalIVSC)
3392
3393   /// Generate a canonical vector induction variable of the vector loop, with
3394   /// start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF}, and
3395   /// step = <VF*UF, VF*UF, ..., VF*UF>.
3396   void execute(VPTransformState &State) override;
3397
3398   /// Return the cost of this VPWidenCanonicalIVRecipe.
3399   InstructionCost computeCost(ElementCount VF,
3400                               VPCostContext &Ctx) const override {
3401     // TODO: Compute accurate cost after retiring the legacy cost model.
3402     return 0;
3403   }
3404
3405 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3406   /// Print the recipe.
3407   void print(raw_ostream &O, const Twine &Indent,
3408              VPSlotTracker &SlotTracker) const override;
3409 #endif
3410 };
3411
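// --- Illustrative sketch (editorial addition, not part of upstream VPlan.h):
// a concrete instance of the start/step vectors documented above, for
// VF = 4 and UF = 2:
//
//   part 0 start: <0, 1, 2, 3>
//   part 1 start: <4, 5, 6, 7>
//   step:         <8, 8, 8, 8>   ; VF * UF
//
// so across one vector iteration the widened IV enumerates the lanes
// [Part * VF, Part * VF + VF) of every unrolled part.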
3412 /// A recipe for converting the input value \p IV to the corresponding
3413 /// value of an IV with different start and step values, using Start + IV *
3414 /// Step.
3415 class VPDerivedIVRecipe : public VPSingleDefRecipe {
3416   /// Kind of the induction.
3417   const InductionDescriptor::InductionKind Kind;
3418   /// If not nullptr, the floating point induction binary operator. Must be set
3419   /// for floating point inductions.
3420   const FPMathOperator *FPBinOp;
3421
3422   /// Name to use for the generated IR instruction for the derived IV.
3423   std::string Name;
3424
3425 public:
3426   VPDerivedIVRecipe(const InductionDescriptor &IndDesc, VPValue *Start,
3427                     VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step,
3428                     const Twine &Name = "")
3429       : VPDerivedIVRecipe(
3430             IndDesc.getKind(),
3431             dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp()),
3432             Start, CanonicalIV, Step, Name) {}
3433
3434   VPDerivedIVRecipe(InductionDescriptor::InductionKind Kind,
3435                     const FPMathOperator *FPBinOp, VPValue *Start, VPValue *IV,
3436                     VPValue *Step, const Twine &Name = "")
3437       : VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, IV, Step}), Kind(Kind),
3438         FPBinOp(FPBinOp), Name(Name.str()) {}
3439
3440   ~VPDerivedIVRecipe() override = default;
3441
3442   VPDerivedIVRecipe *clone() override {
3443     return new VPDerivedIVRecipe(Kind, FPBinOp, getStartValue(), getOperand(1),
3444                                  getStepValue());
3445   }
3446
3447   VP_CLASSOF_IMPL(VPDef::VPDerivedIVSC)
3448
3449   /// Generate the transformed value of the induction at offset StartValue (1.
3450   /// operand) + IV (2. operand) * StepValue (3. operand).
3451   void execute(VPTransformState &State) override;
3452
3453   /// Return the cost of this VPDerivedIVRecipe.
3454   InstructionCost computeCost(ElementCount VF,
3455                               VPCostContext &Ctx) const override {
3456     // TODO: Compute accurate cost after retiring the legacy cost model.
3457     return 0;
3458   }
3459
3460 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3461   /// Print the recipe.
3462   void print(raw_ostream &O, const Twine &Indent,
3463              VPSlotTracker &SlotTracker) const override;
3464 #endif
3465
3466   Type *getScalarType() const {
3467     return getStartValue()->getLiveInIRValue()->getType();
3468   }
3469
3470   VPValue *getStartValue() const { return getOperand(0); }
3471   VPValue *getStepValue() const { return getOperand(2); }
3472
3473   /// Returns true if the recipe only uses the first lane of operand \p Op.
3474   bool onlyFirstLaneUsed(const VPValue *Op) const override {
3475     assert(is_contained(operands(), Op) &&
3476            "Op must be an operand of the recipe");
3477     return true;
3478   }
3479 };
3480
3481 /// A recipe for handling phi nodes of integer and floating-point inductions,
3482 /// producing their scalar values.
3483 class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags,
3484                               public VPUnrollPartAccessor<2> {
3485   Instruction::BinaryOps InductionOpcode;
3486
3487 public:
3488   VPScalarIVStepsRecipe(VPValue *IV, VPValue *Step,
3489                         Instruction::BinaryOps Opcode, FastMathFlags FMFs)
3490       : VPRecipeWithIRFlags(VPDef::VPScalarIVStepsSC,
3491                             ArrayRef<VPValue *>({IV, Step}), FMFs),
3492         InductionOpcode(Opcode) {}
3493
3494   VPScalarIVStepsRecipe(const InductionDescriptor &IndDesc, VPValue *IV,
3495                         VPValue *Step)
3496       : VPScalarIVStepsRecipe(
3497             IV, Step, IndDesc.getInductionOpcode(),
3498             dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp())
3499                 ? IndDesc.getInductionBinOp()->getFastMathFlags()
3500                 : FastMathFlags()) {}
3501
3502   ~VPScalarIVStepsRecipe() override = default;
3503
3504   VPScalarIVStepsRecipe *clone() override {
3505     return new VPScalarIVStepsRecipe(
3506         getOperand(0), getOperand(1), InductionOpcode,
3507         hasFastMathFlags() ? getFastMathFlags() : FastMathFlags());
3508   }
3509
3510   VP_CLASSOF_IMPL(VPDef::VPScalarIVStepsSC)
3511
3512   /// Generate the scalarized versions of the phi node as needed by its users.
3513   void execute(VPTransformState &State) override;
3514
3515   /// Return the cost of this VPScalarIVStepsRecipe.
3516   InstructionCost computeCost(ElementCount VF,
3517                               VPCostContext &Ctx) const override {
3518     // TODO: Compute accurate cost after retiring the legacy cost model.
3519     return 0;
3520   }
3521
3522 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3523   /// Print the recipe.
3524   void print(raw_ostream &O, const Twine &Indent,
3525              VPSlotTracker &SlotTracker) const override;
3526 #endif
3527
3528   VPValue *getStepValue() const { return getOperand(1); }
3529
3530   /// Returns true if the recipe only uses the first lane of operand \p Op.
3531   bool onlyFirstLaneUsed(const VPValue *Op) const override {
3532     assert(is_contained(operands(), Op) &&
3533            "Op must be an operand of the recipe");
3534     return true;
3535   }
3536 };
3537
3538 /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
3539 /// holds a sequence of zero or more VPRecipes, each representing a sequence of
3540 /// output IR instructions. All PHI-like recipes must come before any non-PHI recipes.
3541 class VPBasicBlock : public VPBlockBase {
3542   friend class VPlan;
3543
3544   /// Use VPlan::createVPBasicBlock to create VPBasicBlocks.
3545   VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr)
3546       : VPBlockBase(VPBasicBlockSC, Name.str()) {
3547     if (Recipe)
3548       appendRecipe(Recipe);
3549   }
3550
3551 public:
3552   using RecipeListTy = iplist<VPRecipeBase>;
3553
3554 protected:
3555   /// The VPRecipes held in the order of output instructions to generate.
3556 RecipeListTy Recipes; 3557 3558 VPBasicBlock(const unsigned char BlockSC, const Twine &Name = "") 3559 : VPBlockBase(BlockSC, Name.str()) {} 3560 3561 public: 3562 ~VPBasicBlock() override { 3563 while (!Recipes.empty()) 3564 Recipes.pop_back(); 3565 } 3566 3567 /// Instruction iterators... 3568 using iterator = RecipeListTy::iterator; 3569 using const_iterator = RecipeListTy::const_iterator; 3570 using reverse_iterator = RecipeListTy::reverse_iterator; 3571 using const_reverse_iterator = RecipeListTy::const_reverse_iterator; 3572 3573 //===--------------------------------------------------------------------===// 3574 /// Recipe iterator methods 3575 /// 3576 inline iterator begin() { return Recipes.begin(); } 3577 inline const_iterator begin() const { return Recipes.begin(); } 3578 inline iterator end() { return Recipes.end(); } 3579 inline const_iterator end() const { return Recipes.end(); } 3580 3581 inline reverse_iterator rbegin() { return Recipes.rbegin(); } 3582 inline const_reverse_iterator rbegin() const { return Recipes.rbegin(); } 3583 inline reverse_iterator rend() { return Recipes.rend(); } 3584 inline const_reverse_iterator rend() const { return Recipes.rend(); } 3585 3586 inline size_t size() const { return Recipes.size(); } 3587 inline bool empty() const { return Recipes.empty(); } 3588 inline const VPRecipeBase &front() const { return Recipes.front(); } 3589 inline VPRecipeBase &front() { return Recipes.front(); } 3590 inline const VPRecipeBase &back() const { return Recipes.back(); } 3591 inline VPRecipeBase &back() { return Recipes.back(); } 3592 3593 /// Returns a reference to the list of recipes. 3594 RecipeListTy &getRecipeList() { return Recipes; } 3595 3596 /// Returns a pointer to a member of the recipe list. 3597 static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) { 3598 return &VPBasicBlock::Recipes; 3599 } 3600 3601 /// Method to support type inquiry through isa, cast, and dyn_cast. 3602 static inline bool classof(const VPBlockBase *V) { 3603 return V->getVPBlockID() == VPBlockBase::VPBasicBlockSC || 3604 V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC; 3605 } 3606 3607 void insert(VPRecipeBase *Recipe, iterator InsertPt) { 3608 assert(Recipe && "No recipe to append."); 3609 assert(!Recipe->Parent && "Recipe already in VPlan"); 3610 Recipe->Parent = this; 3611 Recipes.insert(InsertPt, Recipe); 3612 } 3613 3614 /// Augment the existing recipes of a VPBasicBlock with an additional 3615 /// \p Recipe as the last recipe. 3616 void appendRecipe(VPRecipeBase *Recipe) { insert(Recipe, end()); } 3617 3618 /// The method which generates the output IR instructions that correspond to 3619 /// this VPBasicBlock, thereby "executing" the VPlan. 3620 void execute(VPTransformState *State) override; 3621 3622 /// Return the cost of this VPBasicBlock. 3623 InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override; 3624 3625 /// Return the position of the first non-phi node recipe in the block. 3626 iterator getFirstNonPhi(); 3627 3628 /// Returns an iterator range over the PHI-like recipes in the block. 3629 iterator_range<iterator> phis() { 3630 return make_range(begin(), getFirstNonPhi()); 3631 } 3632 3633 /// Split current block at \p SplitAt by inserting a new block between the 3634 /// current block and its successors and moving all recipes starting at 3635 /// SplitAt to the new block. Returns the new block. 
3636   VPBasicBlock *splitAt(iterator SplitAt);
3637
3638   VPRegionBlock *getEnclosingLoopRegion();
3639   const VPRegionBlock *getEnclosingLoopRegion() const;
3640
3641 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3642   /// Print this VPBasicBlock to \p O, prefixing all lines with \p Indent. \p
3643   /// SlotTracker is used to print unnamed VPValues using consecutive numbers.
3644   ///
3645   /// Note that the numbering is applied to the whole VPlan, so printing
3646   /// individual blocks is consistent with the whole VPlan printing.
3647   void print(raw_ostream &O, const Twine &Indent,
3648              VPSlotTracker &SlotTracker) const override;
3649   using VPBlockBase::print; // Get the print(raw_stream &O) version.
3650 #endif
3651
3652   /// If the block has multiple successors, return the branch recipe terminating
3653   /// the block. If there is no successor or only a single successor, return nullptr.
3654   VPRecipeBase *getTerminator();
3655   const VPRecipeBase *getTerminator() const;
3656
3657   /// Returns true if the block is exiting its parent region.
3658   bool isExiting() const;
3659
3660   /// Clone the current block and its recipes, without updating the operands of
3661   /// the cloned recipes.
3662   VPBasicBlock *clone() override;
3663
3664 protected:
3665   /// Execute the recipes in the IR basic block \p BB.
3666   void executeRecipes(VPTransformState *State, BasicBlock *BB);
3667
3668   /// Connect the VPBB's predecessors in the VPlan CFG to the IR basic block
3669   /// generated for this VPBB.
3670   void connectToPredecessors(VPTransformState::CFGState &CFG);
3671
3672 private:
3673   /// Create an IR BasicBlock to hold the output instructions generated by this
3674   /// VPBasicBlock, and return it. Update the CFGState accordingly.
3675   BasicBlock *createEmptyBasicBlock(VPTransformState::CFGState &CFG);
3676 };
3677
3678 /// A special type of VPBasicBlock that wraps an existing IR basic block.
3679 /// Recipes of the block get added before the first non-phi instruction in the
3680 /// wrapped block.
3681 /// Note: At the moment, VPIRBasicBlock can only be used to wrap VPlan's
3682 /// preheader block.
3683 class VPIRBasicBlock : public VPBasicBlock {
3684   friend class VPlan;
3685
3686   BasicBlock *IRBB;
3687
3688   /// Use VPlan::createVPIRBasicBlock to create VPIRBasicBlocks.
3689   VPIRBasicBlock(BasicBlock *IRBB)
3690       : VPBasicBlock(VPIRBasicBlockSC,
3691                      (Twine("ir-bb<") + IRBB->getName() + Twine(">")).str()),
3692         IRBB(IRBB) {}
3693
3694 public:
3695   ~VPIRBasicBlock() override {}
3696
3697   static inline bool classof(const VPBlockBase *V) {
3698     return V->getVPBlockID() == VPBlockBase::VPIRBasicBlockSC;
3699   }
3700
3701   /// The method which generates the output IR instructions that correspond to
3702   /// this VPBasicBlock, thereby "executing" the VPlan.
3703   void execute(VPTransformState *State) override;
3704
3705   VPIRBasicBlock *clone() override;
3706
3707   BasicBlock *getIRBasicBlock() const { return IRBB; }
3708 };
3709
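// --- Illustrative sketch (editorial addition, not part of upstream VPlan.h):
// a typical traversal of a VPBasicBlock, relying on the invariant that all
// phi-like recipes precede non-phi recipes. The helper name is hypothetical.
inline unsigned exampleCountNonPhiRecipes(VPBasicBlock &VPBB) {
  // phis() covers [begin(), getFirstNonPhi()); everything from the first
  // non-phi recipe onwards is a non-phi recipe.
  unsigned NumNonPhis = 0;
  for (VPRecipeBase &R : make_range(VPBB.getFirstNonPhi(), VPBB.end())) {
    (void)R;
    ++NumNonPhis;
  }
  return NumNonPhis;
}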
3710 /// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks
3711 /// which form a Single-Entry-Single-Exiting subgraph of the output IR CFG.
3712 /// A VPRegionBlock may indicate that its contents are to be replicated several
3713 /// times. This is designed to support predicated scalarization, in which a
3714 /// scalar if-then code structure needs to be generated VF * UF times. Having
3715 /// this replication indicator helps to keep a single model for multiple
3716 /// candidate VFs. The actual replication takes place only once the desired VF
3717 /// and UF have been determined.
3718 class VPRegionBlock : public VPBlockBase {
3719   friend class VPlan;
3720
3721   /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock.
3722   VPBlockBase *Entry;
3723
3724   /// Hold the Single Exiting block of the SESE region modelled by the
3725   /// VPRegionBlock.
3726   VPBlockBase *Exiting;
3727
3728   /// An indicator whether this region is to generate multiple replicated
3729   /// instances of output IR corresponding to its VPBlockBases.
3730   bool IsReplicator;
3731
3732   /// Use VPlan::createVPRegionBlock to create VPRegionBlocks.
3733   VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
3734                 const std::string &Name = "", bool IsReplicator = false)
3735       : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting),
3736         IsReplicator(IsReplicator) {
3737     assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
3738     assert(Exiting->getSuccessors().empty() && "Exit block has successors.");
3739     Entry->setParent(this);
3740     Exiting->setParent(this);
3741   }
3742   VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
3743       : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr),
3744         IsReplicator(IsReplicator) {}
3745
3746 public:
3747   ~VPRegionBlock() override {}
3748
3749   /// Method to support type inquiry through isa, cast, and dyn_cast.
3750   static inline bool classof(const VPBlockBase *V) {
3751     return V->getVPBlockID() == VPBlockBase::VPRegionBlockSC;
3752   }
3753
3754   const VPBlockBase *getEntry() const { return Entry; }
3755   VPBlockBase *getEntry() { return Entry; }
3756
3757   /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
3758   /// EntryBlock must have no predecessors.
3759   void setEntry(VPBlockBase *EntryBlock) {
3760     assert(EntryBlock->getPredecessors().empty() &&
3761            "Entry block cannot have predecessors.");
3762     Entry = EntryBlock;
3763     EntryBlock->setParent(this);
3764   }
3765
3766   const VPBlockBase *getExiting() const { return Exiting; }
3767   VPBlockBase *getExiting() { return Exiting; }
3768
3769   /// Set \p ExitingBlock as the exiting VPBlockBase of this VPRegionBlock. \p
3770   /// ExitingBlock must have no successors.
3771   void setExiting(VPBlockBase *ExitingBlock) {
3772     assert(ExitingBlock->getSuccessors().empty() &&
3773            "Exit block cannot have successors.");
3774     Exiting = ExitingBlock;
3775     ExitingBlock->setParent(this);
3776   }
3777
3778   /// Returns the pre-header VPBasicBlock of the loop region.
3779   VPBasicBlock *getPreheaderVPBB() {
3780     assert(!isReplicator() && "should only get pre-header of loop regions");
3781     return getSinglePredecessor()->getExitingBasicBlock();
3782   }
3783
3784   /// An indicator whether this region is to generate multiple replicated
3785   /// instances of output IR corresponding to its VPBlockBases.
3786   bool isReplicator() const { return IsReplicator; }
3787
3788   /// The method which generates the output IR instructions that correspond to
3789   /// this VPRegionBlock, thereby "executing" the VPlan.
3790   void execute(VPTransformState *State) override;
3791
3792   // Return the cost of this region.
3793   InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
3794
3795 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3796   /// Print this VPRegionBlock to \p O (recursively), prefixing all lines with
3797   /// \p Indent. \p SlotTracker is used to print unnamed VPValues using
3798   /// consecutive numbers.
3799   ///
3800   /// Note that the numbering is applied to the whole VPlan, so printing
3801   /// individual regions is consistent with the whole VPlan printing.
3802   void print(raw_ostream &O, const Twine &Indent,
3803              VPSlotTracker &SlotTracker) const override;
3804   using VPBlockBase::print; // Get the print(raw_stream &O) version.
3805 #endif
3806
3807   /// Clone all blocks in the single-entry single-exit region of the block and
3808   /// their recipes without updating the operands of the cloned recipes.
3809   VPRegionBlock *clone() override;
3810 };
3811
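// --- Illustrative sketch (editorial addition, not part of upstream VPlan.h):
// how the replicator flag is typically consulted when walking the
// hierarchical CFG: loop regions model the vector loop itself, while
// replicate regions are expanded VF * UF times. Hypothetical predicate:
inline bool exampleIsLoopRegion(const VPBlockBase *VPB) {
  auto *Region = dyn_cast<VPRegionBlock>(VPB);
  return Region && !Region->isReplicator();
}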
3812 /// VPlan models a candidate for vectorization, encoding various decisions taken
3813 /// to produce efficient output IR, including which branches, basic-blocks and
3814 /// output IR instructions to generate, and their cost. VPlan holds a
3815 /// Hierarchical-CFG of VPBasicBlocks and VPRegionBlocks rooted at an Entry
3816 /// VPBasicBlock.
3817 class VPlan {
3818   friend class VPlanPrinter;
3819   friend class VPSlotTracker;
3820
3821   /// VPBasicBlock corresponding to the original preheader. Used to place
3822   /// VPExpandSCEV recipes for expressions used during skeleton creation and the
3823   /// rest of VPlan execution.
3824   /// When this VPlan is used for the epilogue vector loop, the entry will be
3825   /// replaced by a new entry block created during skeleton creation.
3826   VPBasicBlock *Entry;
3827
3828   /// VPIRBasicBlock wrapping the header of the original scalar loop.
3829   VPIRBasicBlock *ScalarHeader;
3830
3831   /// Holds the VFs applicable to this VPlan.
3832   SmallSetVector<ElementCount, 2> VFs;
3833
3834   /// Holds the UFs applicable to this VPlan. If empty, the VPlan is valid for
3835   /// any UF.
3836   SmallSetVector<unsigned, 2> UFs;
3837
3838   /// Holds the name of the VPlan, for printing.
3839   std::string Name;
3840
3841   /// Represents the trip count of the original loop, for folding
3842   /// the tail.
3843   VPValue *TripCount = nullptr;
3844
3845   /// Represents the backedge taken count of the original loop, for folding
3846   /// the tail. It equals TripCount - 1.
3847   VPValue *BackedgeTakenCount = nullptr;
3848
3849   /// Represents the vector trip count.
3850   VPValue VectorTripCount;
3851
3852   /// Represents the vectorization factor of the loop.
3853   VPValue VF;
3854
3855   /// Represents the loop-invariant VF * UF of the vector loop region.
3856   VPValue VFxUF;
3857
3858   /// Holds a mapping between Values and their corresponding VPValue inside
3859   /// VPlan.
3860   Value2VPValueTy Value2VPValue;
3861
3862   /// Contains all the external definitions created for this VPlan. External
3863   /// definitions are VPValues that hold a pointer to their underlying IR.
3864   SmallVector<VPValue *, 16> VPLiveInsToFree;
3865
3866   /// Mapping from SCEVs to the VPValues representing their expansions.
3867   /// NOTE: This mapping is temporary and will be removed once all users have
3868   /// been modeled in VPlan directly.
3869   DenseMap<const SCEV *, VPValue *> SCEVToExpansion;
3870
3871   /// Blocks allocated and owned by the VPlan. They will be deleted once the
3872   /// VPlan is destroyed.
3873   SmallVector<VPBlockBase *> CreatedBlocks;
3874
3875   /// Construct a VPlan with \p Entry as its entry block and with \p ScalarHeader
3876   /// wrapping the original header of the scalar loop.
3877   VPlan(VPBasicBlock *Entry, VPIRBasicBlock *ScalarHeader)
3878       : Entry(Entry), ScalarHeader(ScalarHeader) {
3879     Entry->setPlan(this);
3880     assert(ScalarHeader->getNumSuccessors() == 0 &&
3881            "scalar header must be a leaf node");
3882   }
3883
3884 public:
3885   /// Construct a VPlan for \p L. This will create VPIRBasicBlocks wrapping the
3886   /// original preheader and scalar header of \p L, to be used as entry and
3887   /// scalar header blocks of the new VPlan.
3888   VPlan(Loop *L);
3889
3890   /// Construct a VPlan with a new VPBasicBlock as entry, a VPIRBasicBlock
3891   /// wrapping \p ScalarHeaderBB and a trip count of \p TC.
3892   VPlan(BasicBlock *ScalarHeaderBB, VPValue *TC) {
3893     setEntry(createVPBasicBlock("preheader"));
3894     ScalarHeader = createVPIRBasicBlock(ScalarHeaderBB);
3895     TripCount = TC;
3896   }
3897
3898   ~VPlan();
3899
3900   void setEntry(VPBasicBlock *VPBB) {
3901     Entry = VPBB;
3902     VPBB->setPlan(this);
3903   }
3904
3905   /// Create initial VPlan, having an "entry" VPBasicBlock (wrapping the
3906   /// original scalar pre-header) which contains SCEV expansions that need
3907   /// to happen before the CFG is modified (when executing a VPlan for the
3908   /// epilogue vector loop, the original entry needs to be replaced by a new
3909   /// one); a VPBasicBlock for the vector pre-header, followed by a region for
3910   /// the vector loop, followed by the middle VPBasicBlock. If a check is needed
3911   /// to guard executing the scalar epilogue loop, it will be added to the
3912   /// middle block, together with VPBasicBlocks for the scalar preheader and
3913   /// exit blocks. \p InductionTy is the type of the canonical induction and
3914   /// used for related values, like the trip count expression.
3915   static VPlanPtr createInitialVPlan(Type *InductionTy,
3916                                      PredicatedScalarEvolution &PSE,
3917                                      bool RequiresScalarEpilogueCheck,
3918                                      bool TailFolded, Loop *TheLoop);
3919
3920   /// Prepare the plan for execution, setting up the required live-in values.
3921   void prepareToExecute(Value *TripCount, Value *VectorTripCount,
3922                         VPTransformState &State);
3923
3924   /// Generate the IR code for this VPlan.
3925   void execute(VPTransformState *State);
3926
3927   /// Return the cost of this plan.
3928   InstructionCost cost(ElementCount VF, VPCostContext &Ctx);
3929
3930   VPBasicBlock *getEntry() { return Entry; }
3931   const VPBasicBlock *getEntry() const { return Entry; }
3932
3933   /// Returns the preheader of the vector loop region, if one exists, or null
3934   /// otherwise.
3935   VPBasicBlock *getVectorPreheader() {
3936     VPRegionBlock *VectorRegion = getVectorLoopRegion();
3937     return VectorRegion
3938                ? cast<VPBasicBlock>(VectorRegion->getSinglePredecessor())
3939                : nullptr;
3940   }
3941
3942   /// Returns the VPRegionBlock of the vector loop.
3943   VPRegionBlock *getVectorLoopRegion();
3944   const VPRegionBlock *getVectorLoopRegion() const;
3945
3946   /// Returns the 'middle' block of the plan, that is the block that selects
3947   /// whether to execute the scalar tail loop or the exit block from the loop
3948   /// latch.
3949   const VPBasicBlock *getMiddleBlock() const {
3950     return cast<VPBasicBlock>(getScalarPreheader()->getPredecessors().front());
3951   }
3952   VPBasicBlock *getMiddleBlock() {
3953     return cast<VPBasicBlock>(getScalarPreheader()->getPredecessors().front());
3954   }
3955
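  // --- Illustrative sketch (editorial addition, not part of upstream
  // VPlan.h): the usual lifecycle of a plan as driven by the vectorizer,
  // simplified and with hypothetical local names:
  //
  //   VPlanPtr Plan = VPlan::createInitialVPlan(
  //       IndTy, PSE, /*RequiresScalarEpilogueCheck=*/true,
  //       /*TailFolded=*/false, L);
  //   Plan->addVF(ElementCount::getFixed(4));
  //   // ... VPlan-to-VPlan transformations and costing ...
  //   Plan->setVF(BestVF);
  //   Plan->setUF(BestUF);
  //   Plan->prepareToExecute(TripCountV, VectorTripCountV, State);
  //   Plan->execute(&State);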
3956   /// Return the VPBasicBlock for the preheader of the scalar loop.
3957   VPBasicBlock *getScalarPreheader() const {
3958     return cast<VPBasicBlock>(getScalarHeader()->getSinglePredecessor());
3959   }
3960
3961   /// Return the VPIRBasicBlock wrapping the header of the scalar loop.
3962   VPIRBasicBlock *getScalarHeader() const { return ScalarHeader; }
3963
3964   /// Return an iterator range over the VPIRBasicBlocks wrapping the exit blocks
3965   /// of the VPlan, that is, leaf nodes except the scalar header. Defined in
3966   /// VPlanHCFG, as the definition of the type needs access to the definitions
3967   /// of VPBlockShallowTraversalWrapper.
3968   auto getExitBlocks();
3969
3970   /// The trip count of the original loop.
3971   VPValue *getTripCount() const {
3972     assert(TripCount && "trip count needs to be set before accessing it");
3973     return TripCount;
3974   }
3975
3976   /// Resets the trip count for the VPlan. The caller must make sure all uses of
3977   /// the original trip count have been replaced.
3978   void resetTripCount(VPValue *NewTripCount) {
3979     assert(TripCount && NewTripCount && TripCount->getNumUsers() == 0 &&
3980            "TripCount must always be set");
3981     TripCount = NewTripCount;
3982   }
3983
3984   /// The backedge taken count of the original loop.
3985   VPValue *getOrCreateBackedgeTakenCount() {
3986     if (!BackedgeTakenCount)
3987       BackedgeTakenCount = new VPValue();
3988     return BackedgeTakenCount;
3989   }
3990
3991   /// The vector trip count.
3992   VPValue &getVectorTripCount() { return VectorTripCount; }
3993
3994   /// Returns the VF of the vector loop region.
3995   VPValue &getVF() { return VF; }
3996
3997   /// Returns VF * UF of the vector loop region.
3998   VPValue &getVFxUF() { return VFxUF; }
3999
4000   void addVF(ElementCount VF) { VFs.insert(VF); }
4001
4002   void setVF(ElementCount VF) {
4003     assert(hasVF(VF) && "Cannot set VF not already in plan");
4004     VFs.clear();
4005     VFs.insert(VF);
4006   }
4007
4008   bool hasVF(ElementCount VF) { return VFs.count(VF); }
4009   bool hasScalableVF() {
4010     return any_of(VFs, [](ElementCount VF) { return VF.isScalable(); });
4011   }
4012
4013   /// Returns an iterator range over all VFs of the plan.
4014   iterator_range<SmallSetVector<ElementCount, 2>::iterator>
4015   vectorFactors() const {
4016     return {VFs.begin(), VFs.end()};
4017   }
4018
4019   bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }
4020
4021   bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }
4022
4023   unsigned getUF() const {
4024     assert(UFs.size() == 1 && "Expected a single UF");
4025     return UFs[0];
4026   }
4027
4028   void setUF(unsigned UF) {
4029     assert(hasUF(UF) && "Cannot set the UF not already in plan");
4030     UFs.clear();
4031     UFs.insert(UF);
4032   }
4033
4034   /// Return a string with the name of the plan and the applicable VFs and UFs.
4035   std::string getName() const;
4036
4037   void setName(const Twine &newName) { Name = newName.str(); }
4038
4039   /// Gets the live-in VPValue for \p V or adds a new live-in (if none exists
4040   /// yet) for \p V.
4041 VPValue *getOrAddLiveIn(Value *V) { 4042 assert(V && "Trying to get or add the VPValue of a null Value"); 4043 if (!Value2VPValue.count(V)) { 4044 VPValue *VPV = new VPValue(V); 4045 VPLiveInsToFree.push_back(VPV); 4046 assert(VPV->isLiveIn() && "VPV must be a live-in."); 4047 assert(!Value2VPValue.count(V) && "Value already exists in VPlan"); 4048 Value2VPValue[V] = VPV; 4049 } 4050 4051 assert(Value2VPValue.count(V) && "Value does not exist in VPlan"); 4052 assert(Value2VPValue[V]->isLiveIn() && 4053 "Only live-ins should be in mapping"); 4054 return Value2VPValue[V]; 4055 } 4056 4057 /// Return the live-in VPValue for \p V, if there is one or nullptr otherwise. 4058 VPValue *getLiveIn(Value *V) const { return Value2VPValue.lookup(V); } 4059 4060 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 4061 /// Print the live-ins of this VPlan to \p O. 4062 void printLiveIns(raw_ostream &O) const; 4063 4064 /// Print this VPlan to \p O. 4065 void print(raw_ostream &O) const; 4066 4067 /// Print this VPlan in DOT format to \p O. 4068 void printDOT(raw_ostream &O) const; 4069 4070 /// Dump the plan to stderr (for debugging). 4071 LLVM_DUMP_METHOD void dump() const; 4072 #endif 4073 4074 /// Returns the canonical induction recipe of the vector loop. 4075 VPCanonicalIVPHIRecipe *getCanonicalIV() { 4076 VPBasicBlock *EntryVPBB = getVectorLoopRegion()->getEntryBasicBlock(); 4077 if (EntryVPBB->empty()) { 4078 // VPlan native path. 4079 EntryVPBB = cast<VPBasicBlock>(EntryVPBB->getSingleSuccessor()); 4080 } 4081 return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin()); 4082 } 4083 4084 VPValue *getSCEVExpansion(const SCEV *S) const { 4085 return SCEVToExpansion.lookup(S); 4086 } 4087 4088 void addSCEVExpansion(const SCEV *S, VPValue *V) { 4089 assert(!SCEVToExpansion.contains(S) && "SCEV already expanded"); 4090 SCEVToExpansion[S] = V; 4091 } 4092 4093 /// Clone the current VPlan, update all VPValues of the new VPlan and cloned 4094 /// recipes to refer to the clones, and return it. 4095 VPlan *duplicate(); 4096 4097 /// Create a new VPBasicBlock with \p Name and containing \p Recipe if 4098 /// present. The returned block is owned by the VPlan and deleted once the 4099 /// VPlan is destroyed. 4100 VPBasicBlock *createVPBasicBlock(const Twine &Name, 4101 VPRecipeBase *Recipe = nullptr) { 4102 auto *VPB = new VPBasicBlock(Name, Recipe); 4103 CreatedBlocks.push_back(VPB); 4104 return VPB; 4105 } 4106 4107 /// Create a new VPRegionBlock with \p Entry, \p Exiting and \p Name. If \p 4108 /// IsReplicator is true, the region is a replicate region. The returned block 4109 /// is owned by the VPlan and deleted once the VPlan is destroyed. 4110 VPRegionBlock *createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, 4111 const std::string &Name = "", 4112 bool IsReplicator = false) { 4113 auto *VPB = new VPRegionBlock(Entry, Exiting, Name, IsReplicator); 4114 CreatedBlocks.push_back(VPB); 4115 return VPB; 4116 } 4117 4118 /// Create a new VPRegionBlock with \p Name and entry and exiting blocks set 4119 /// to nullptr. If \p IsReplicator is true, the region is a replicate region. 4120 /// The returned block is owned by the VPlan and deleted once the VPlan is 4121 /// destroyed. 
4122   VPRegionBlock *createVPRegionBlock(const std::string &Name = "",
4123                                      bool IsReplicator = false) {
4124     auto *VPB = new VPRegionBlock(Name, IsReplicator);
4125     CreatedBlocks.push_back(VPB);
4126     return VPB;
4127   }
4128
4129   /// Create a VPIRBasicBlock wrapping \p IRBB, but do not create
4130   /// VPIRInstructions wrapping the instructions in \p IRBB. The returned
4131   /// block is owned by the VPlan and deleted once the VPlan is destroyed.
4132   VPIRBasicBlock *createEmptyVPIRBasicBlock(BasicBlock *IRBB);
4133
4134   /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all
4135   /// instructions in \p IRBB, except its terminator which is managed by the
4136   /// successors of the block in VPlan. The returned block is owned by the VPlan
4137   /// and deleted once the VPlan is destroyed.
4138   VPIRBasicBlock *createVPIRBasicBlock(BasicBlock *IRBB);
4139 };
4140
4141 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4142 /// VPlanPrinter prints a given VPlan to a given output stream. The printing is
4143 /// indented and follows the dot format.
4144 class VPlanPrinter {
4145   raw_ostream &OS;
4146   const VPlan &Plan;
4147   unsigned Depth = 0;
4148   unsigned TabWidth = 2;
4149   std::string Indent;
4150   unsigned BID = 0;
4151   SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
4152
4153   VPSlotTracker SlotTracker;
4154
4155   /// Handle indentation.
4156   void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
4157
4158   /// Print a given \p Block of the Plan.
4159   void dumpBlock(const VPBlockBase *Block);
4160
4161   /// Print the information related to the CFG edges going out of a given
4162   /// \p Block, followed by printing the successor blocks themselves.
4163   void dumpEdges(const VPBlockBase *Block);
4164
4165   /// Print a given \p BasicBlock, including its VPRecipes, followed by printing
4166   /// its successor blocks.
4167   void dumpBasicBlock(const VPBasicBlock *BasicBlock);
4168
4169   /// Print a given \p Region of the Plan.
4170   void dumpRegion(const VPRegionBlock *Region);
4171
4172   unsigned getOrCreateBID(const VPBlockBase *Block) {
4173     return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
4174   }
4175
4176   Twine getOrCreateName(const VPBlockBase *Block);
4177
4178   Twine getUID(const VPBlockBase *Block);
4179
4180   /// Print the information related to a CFG edge between two VPBlockBases.
4181   void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
4182                 const Twine &Label);
4183
4184 public:
4185   VPlanPrinter(raw_ostream &O, const VPlan &P)
4186       : OS(O), Plan(P), SlotTracker(&P) {}
4187
4188   LLVM_DUMP_METHOD void dump();
4189 };
4190
4191 struct VPlanIngredient {
4192   const Value *V;
4193
4194   VPlanIngredient(const Value *V) : V(V) {}
4195
4196   void print(raw_ostream &O) const;
4197 };
4198
4199 inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) {
4200   I.print(OS);
4201   return OS;
4202 }
4203
4204 inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) {
4205   Plan.print(OS);
4206   return OS;
4207 }
4208 #endif
4209
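// --- Illustrative sketch (editorial addition, not part of upstream VPlan.h):
// how the debug printing entry points compose. operator<< above emits the
// human-readable form via VPlan::print, while printDOT routes through
// VPlanPrinter. The helper name is hypothetical.
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
inline void examplePrintPlan(const VPlan &Plan, raw_ostream &OS) {
  OS << Plan;        // Human-readable plan.
  Plan.printDOT(OS); // Same plan, in dot format.
}
#endif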
4210 class VPInterleavedAccessInfo {
4211   DenseMap<VPInstruction *, InterleaveGroup<VPInstruction> *>
4212       InterleaveGroupMap;
4213
4214   /// Type for mapping of instruction based interleave groups to VPInstruction
4215   /// interleave groups.
4216   using Old2NewTy = DenseMap<InterleaveGroup<Instruction> *,
4217                              InterleaveGroup<VPInstruction> *>;
4218
4219   /// Recursively traverse \p Region and populate VPlan based interleave groups
4220   /// based on \p IAI.
4221   void visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New,
4222                    InterleavedAccessInfo &IAI);
4223   /// Recursively traverse \p Block and populate VPlan based interleave groups
4224   /// based on \p IAI.
4225   void visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
4226                   InterleavedAccessInfo &IAI);
4227
4228 public:
4229   VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI);
4230
4231   ~VPInterleavedAccessInfo() {
4232     SmallPtrSet<InterleaveGroup<VPInstruction> *, 4> DelSet;
4233     // Avoid releasing a pointer twice.
4234     for (auto &I : InterleaveGroupMap)
4235       DelSet.insert(I.second);
4236     for (auto *Ptr : DelSet)
4237       delete Ptr;
4238   }
4239
4240   /// Get the interleave group that \p Instr belongs to.
4241   ///
4242   /// \returns nullptr if \p Instr does not belong to any interleave group.
4243   InterleaveGroup<VPInstruction> *
4244   getInterleaveGroup(VPInstruction *Instr) const {
4245     return InterleaveGroupMap.lookup(Instr);
4246   }
4247 };
4248
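// --- Illustrative sketch (editorial addition, not part of upstream VPlan.h):
// querying interleave information while combining instructions; returns the
// number of members in the group of I, or 0 if I is not interleaved. The
// helper name is hypothetical.
inline unsigned exampleInterleaveGroupSize(const VPInterleavedAccessInfo &IAI,
                                           VPInstruction *I) {
  if (auto *Group = IAI.getInterleaveGroup(I))
    return Group->getNumMembers();
  return 0;
}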
4249 /// Class that maps (parts of) an existing VPlan to trees of combined
4250 /// VPInstructions.
4251 class VPlanSlp {
4252   enum class OpMode { Failed, Load, Opcode };
4253
4254   /// A DenseMapInfo implementation for using SmallVector<VPValue *, 4> as
4255   /// DenseMap keys.
4256   struct BundleDenseMapInfo {
4257     static SmallVector<VPValue *, 4> getEmptyKey() {
4258       return {reinterpret_cast<VPValue *>(-1)};
4259     }
4260
4261     static SmallVector<VPValue *, 4> getTombstoneKey() {
4262       return {reinterpret_cast<VPValue *>(-2)};
4263     }
4264
4265     static unsigned getHashValue(const SmallVector<VPValue *, 4> &V) {
4266       return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
4267     }
4268
4269     static bool isEqual(const SmallVector<VPValue *, 4> &LHS,
4270                         const SmallVector<VPValue *, 4> &RHS) {
4271       return LHS == RHS;
4272     }
4273   };
4274
4275   /// Mapping of values in the original VPlan to a combined VPInstruction.
4276   DenseMap<SmallVector<VPValue *, 4>, VPInstruction *, BundleDenseMapInfo>
4277       BundleToCombined;
4278
4279   VPInterleavedAccessInfo &IAI;
4280
4281   /// Basic block to operate on. For now, only instructions in a single BB are
4282   /// considered.
4283   const VPBasicBlock &BB;
4284
4285   /// Indicates whether we managed to combine all visited instructions or not.
4286   bool CompletelySLP = true;
4287
4288   /// Width of the widest combined bundle in bits.
4289   unsigned WidestBundleBits = 0;
4290
4291   using MultiNodeOpTy =
4292       typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>;
4293
4294   // Input operand bundles for the current multi node. Each multi node operand
4295   // bundle contains values not matching the multi node's opcode. They will
4296   // be reordered in reorderMultiNodeOps, once we have completed building a
4297   // multi node.
4298   SmallVector<MultiNodeOpTy, 4> MultiNodeOps;
4299
4300   /// Indicates whether we are building a multi node currently.
4301   bool MultiNodeActive = false;
4302
4303   /// Check if we can vectorize Operands together.
4304   bool areVectorizable(ArrayRef<VPValue *> Operands) const;
4305
4306   /// Add combined instruction \p New for the bundle \p Operands.
4307   void addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New);
4308
4309   /// Indicate we hit a bundle we failed to combine. Returns nullptr for now.
4310   VPInstruction *markFailed();
4311
4312   /// Reorder operands in the multi node to maximize sequential memory access
4313   /// and commutative operations.
4314   SmallVector<MultiNodeOpTy, 4> reorderMultiNodeOps();
4315
4316   /// Choose the best candidate to use for the lane after \p Last. The set of
4317   /// candidates to choose from are values with an opcode matching \p Last's
4318   /// or loads consecutive to \p Last.
4319   std::pair<OpMode, VPValue *> getBest(OpMode Mode, VPValue *Last,
4320                                        SmallPtrSetImpl<VPValue *> &Candidates,
4321                                        VPInterleavedAccessInfo &IAI);
4322
4323 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4324   /// Print bundle \p Values to dbgs().
4325   void dumpBundle(ArrayRef<VPValue *> Values);
4326 #endif
4327
4328 public:
4329   VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {}
4330
4331   ~VPlanSlp() = default;
4332
4333   /// Tries to build an SLP tree rooted at \p Operands and returns a
4334   /// VPInstruction combining \p Operands, if they can be combined.
4335   VPInstruction *buildGraph(ArrayRef<VPValue *> Operands);
4336
4337   /// Return the width of the widest combined bundle in bits.
4338   unsigned getWidestBundleBits() const { return WidestBundleBits; }
4339
4340   /// Return true if all visited instructions can be combined.
4341   bool isCompletelySLP() const { return CompletelySLP; }
4342 };
4343 } // end namespace llvm
4344
4345 #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
4346