xref: /llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp (revision b0da3c6fa44b18023e8421d4cceb0f806046729c)
1 //===- VPlan.cpp - Vectorizer Plan ----------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This is the LLVM vectorization plan. It represents a candidate for
11 /// vectorization, allowing to plan and optimize how to vectorize a given loop
12 /// before generating LLVM-IR.
13 /// The vectorizer uses vectorization plans to estimate the costs of potential
14 /// candidates and if profitable to execute the desired plan, generating vector
15 /// LLVM-IR code.
16 ///
17 //===----------------------------------------------------------------------===//
18 
19 #include "VPlan.h"
20 #include "VPlanDominatorTree.h"
21 #include "llvm/ADT/DepthFirstIterator.h"
22 #include "llvm/ADT/PostOrderIterator.h"
23 #include "llvm/ADT/STLExtras.h"
24 #include "llvm/ADT/SmallVector.h"
25 #include "llvm/ADT/Twine.h"
26 #include "llvm/Analysis/LoopInfo.h"
27 #include "llvm/IR/BasicBlock.h"
28 #include "llvm/IR/CFG.h"
29 #include "llvm/IR/IRBuilder.h"
30 #include "llvm/IR/Instruction.h"
31 #include "llvm/IR/Instructions.h"
32 #include "llvm/IR/Type.h"
33 #include "llvm/IR/Value.h"
34 #include "llvm/Support/Casting.h"
35 #include "llvm/Support/CommandLine.h"
36 #include "llvm/Support/Debug.h"
37 #include "llvm/Support/GenericDomTreeConstruction.h"
38 #include "llvm/Support/GraphWriter.h"
39 #include "llvm/Support/raw_ostream.h"
40 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
41 #include "llvm/Transforms/Utils/LoopVersioning.h"
42 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
43 #include <cassert>
44 #include <string>
45 #include <vector>
46 
47 using namespace llvm;
48 extern cl::opt<bool> EnableVPlanNativePath;
49 
50 #define DEBUG_TYPE "vplan"
51 
52 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
53 raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
54   const VPInstruction *Instr = dyn_cast<VPInstruction>(&V);
55   VPSlotTracker SlotTracker(
56       (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
57   V.print(OS, SlotTracker);
58   return OS;
59 }
60 #endif
61 
62 Value *VPLane::getAsRuntimeExpr(IRBuilderBase &Builder,
63                                 const ElementCount &VF) const {
64   switch (LaneKind) {
65   case VPLane::Kind::ScalableLast:
66     // Lane = RuntimeVF - VF.getKnownMinValue() + Lane
67     return Builder.CreateSub(getRuntimeVF(Builder, Builder.getInt32Ty(), VF),
68                              Builder.getInt32(VF.getKnownMinValue() - Lane));
69   case VPLane::Kind::First:
70     return Builder.getInt32(Lane);
71   }
72   llvm_unreachable("Unknown lane kind");
73 }
74 
75 VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def)
76     : SubclassID(SC), UnderlyingVal(UV), Def(Def) {
77   if (Def)
78     Def->addDefinedValue(this);
79 }
80 
81 VPValue::~VPValue() {
82   assert(Users.empty() && "trying to delete a VPValue with remaining users");
83   if (Def)
84     Def->removeDefinedValue(this);
85 }
86 
87 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
88 void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const {
89   if (const VPRecipeBase *R = dyn_cast_or_null<VPRecipeBase>(Def))
90     R->print(OS, "", SlotTracker);
91   else
92     printAsOperand(OS, SlotTracker);
93 }
94 
95 void VPValue::dump() const {
96   const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this->Def);
97   VPSlotTracker SlotTracker(
98       (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
99   print(dbgs(), SlotTracker);
100   dbgs() << "\n";
101 }
102 
103 void VPDef::dump() const {
104   const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this);
105   VPSlotTracker SlotTracker(
106       (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
107   print(dbgs(), "", SlotTracker);
108   dbgs() << "\n";
109 }
110 #endif
111 
112 // Get the top-most entry block of \p Start. This is the entry block of the
113 // containing VPlan. This function is templated to support both const and non-const blocks
114 template <typename T> static T *getPlanEntry(T *Start) {
115   T *Next = Start;
116   T *Current = Start;
117   while ((Next = Next->getParent()))
118     Current = Next;
119 
120   SmallSetVector<T *, 8> WorkList;
121   WorkList.insert(Current);
122 
123   for (unsigned i = 0; i < WorkList.size(); i++) {
124     T *Current = WorkList[i];
125     if (Current->getNumPredecessors() == 0)
126       return Current;
127     auto &Predecessors = Current->getPredecessors();
128     WorkList.insert(Predecessors.begin(), Predecessors.end());
129   }
130 
131   llvm_unreachable("VPlan without any entry node without predecessors");
132 }
133 
134 VPlan *VPBlockBase::getPlan() { return getPlanEntry(this)->Plan; }
135 
136 const VPlan *VPBlockBase::getPlan() const { return getPlanEntry(this)->Plan; }
137 
138 /// \return the VPBasicBlock that is the entry of Block, possibly indirectly.
139 const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const {
140   const VPBlockBase *Block = this;
141   while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
142     Block = Region->getEntry();
143   return cast<VPBasicBlock>(Block);
144 }
145 
146 VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
147   VPBlockBase *Block = this;
148   while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
149     Block = Region->getEntry();
150   return cast<VPBasicBlock>(Block);
151 }
152 
153 void VPBlockBase::setPlan(VPlan *ParentPlan) {
154   assert(ParentPlan->getEntry() == this &&
155          "Can only set plan on its entry block.");
156   Plan = ParentPlan;
157 }
158 
159 /// \return the VPBasicBlock that is the exit of Block, possibly indirectly.
160 const VPBasicBlock *VPBlockBase::getExitingBasicBlock() const {
161   const VPBlockBase *Block = this;
162   while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
163     Block = Region->getExiting();
164   return cast<VPBasicBlock>(Block);
165 }
166 
167 VPBasicBlock *VPBlockBase::getExitingBasicBlock() {
168   VPBlockBase *Block = this;
169   while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
170     Block = Region->getExiting();
171   return cast<VPBasicBlock>(Block);
172 }
173 
174 VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() {
175   if (!Successors.empty() || !Parent)
176     return this;
177   assert(Parent->getExiting() == this &&
178          "Block w/o successors not the exiting block of its parent.");
179   return Parent->getEnclosingBlockWithSuccessors();
180 }
181 
182 VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
183   if (!Predecessors.empty() || !Parent)
184     return this;
185   assert(Parent->getEntry() == this &&
186          "Block w/o predecessors not the entry of its parent.");
187   return Parent->getEnclosingBlockWithPredecessors();
188 }
189 
190 void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
191   SmallVector<VPBlockBase *, 8> Blocks(depth_first(Entry));
192 
193   for (VPBlockBase *Block : Blocks)
194     delete Block;
195 }
196 
197 VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
198   iterator It = begin();
199   while (It != end() && It->isPhi())
200     It++;
201   return It;
202 }
203 
204 Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
205   if (!Def->getDef())
206     return Def->getLiveInIRValue();
207 
208   if (hasScalarValue(Def, Instance)) {
209     return Data
210         .PerPartScalars[Def][Instance.Part][Instance.Lane.mapToCacheIndex(VF)];
211   }
212 
213   assert(hasVectorValue(Def, Instance.Part));
214   auto *VecPart = Data.PerPartOutput[Def][Instance.Part];
215   if (!VecPart->getType()->isVectorTy()) {
216     assert(Instance.Lane.isFirstLane() && "cannot get lane > 0 for scalar");
217     return VecPart;
218   }
219   // TODO: Cache created scalar values.
220   Value *Lane = Instance.Lane.getAsRuntimeExpr(Builder, VF);
221   auto *Extract = Builder.CreateExtractElement(VecPart, Lane);
222   // set(Def, Extract, Instance);
223   return Extract;
224 }
225 BasicBlock *VPTransformState::CFGState::getPreheaderBBFor(VPRecipeBase *R) {
226   VPRegionBlock *LoopRegion = R->getParent()->getEnclosingLoopRegion();
227   return VPBB2IRBB[LoopRegion->getPreheaderVPBB()];
228 }
229 
230 void VPTransformState::addNewMetadata(Instruction *To,
231                                       const Instruction *Orig) {
232   // If the loop was versioned with memchecks, add the corresponding no-alias
233   // metadata.
234   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
235     LVer->annotateInstWithNoAlias(To, Orig);
236 }
237 
238 void VPTransformState::addMetadata(Instruction *To, Instruction *From) {
239   propagateMetadata(To, From);
240   addNewMetadata(To, From);
241 }
242 
243 void VPTransformState::addMetadata(ArrayRef<Value *> To, Instruction *From) {
244   for (Value *V : To) {
245     if (Instruction *I = dyn_cast<Instruction>(V))
246       addMetadata(I, From);
247   }
248 }
249 
250 void VPTransformState::setDebugLocFromInst(const Value *V) {
251   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
252     const DILocation *DIL = Inst->getDebugLoc();
253 
254     // When a FSDiscriminator is enabled, we don't need to add the multiply
255     // factors to the discriminators.
256     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
257         !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
258       // FIXME: For scalable vectors, assume vscale=1.
259       auto NewDIL =
260           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
261       if (NewDIL)
262         Builder.SetCurrentDebugLocation(*NewDIL);
263       else
264         LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
265                           << DIL->getFilename() << " Line: " << DIL->getLine());
266     } else
267       Builder.SetCurrentDebugLocation(DIL);
268   } else
269     Builder.SetCurrentDebugLocation(DebugLoc());
270 }
271 
272 BasicBlock *
273 VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
274   // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks.
275   // Pred stands for Predessor. Prev stands for Previous - last visited/created.
276   BasicBlock *PrevBB = CFG.PrevBB;
277   BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(),
278                                          PrevBB->getParent(), CFG.ExitBB);
279   LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
280 
281   // Hook up the new basic block to its predecessors.
282   for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
283     VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
284     auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
285     BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
286 
287     assert(PredBB && "Predecessor basic-block not found building successor.");
288     auto *PredBBTerminator = PredBB->getTerminator();
289     LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
290 
291     auto *TermBr = dyn_cast<BranchInst>(PredBBTerminator);
292     if (isa<UnreachableInst>(PredBBTerminator)) {
293       assert(PredVPSuccessors.size() == 1 &&
294              "Predecessor ending w/o branch must have single successor.");
295       DebugLoc DL = PredBBTerminator->getDebugLoc();
296       PredBBTerminator->eraseFromParent();
297       auto *Br = BranchInst::Create(NewBB, PredBB);
298       Br->setDebugLoc(DL);
299     } else if (TermBr && !TermBr->isConditional()) {
300       TermBr->setSuccessor(0, NewBB);
301     } else {
302       // Set each forward successor here when it is created, excluding
303       // backedges. A backward successor is set when the branch is created.
304       unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
305       assert(!TermBr->getSuccessor(idx) &&
306              "Trying to reset an existing successor block.");
307       TermBr->setSuccessor(idx, NewBB);
308     }
309   }
310   return NewBB;
311 }
312 
313 void VPBasicBlock::execute(VPTransformState *State) {
314   bool Replica = State->Instance && !State->Instance->isFirstIteration();
315   VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB;
316   VPBlockBase *SingleHPred = nullptr;
317   BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
318 
319   auto IsLoopRegion = [](VPBlockBase *BB) {
320     auto *R = dyn_cast<VPRegionBlock>(BB);
321     return R && !R->isReplicator();
322   };
323 
324   // 1. Create an IR basic block, or reuse the last one or ExitBB if possible.
325   if (getPlan()->getVectorLoopRegion()->getSingleSuccessor() == this) {
326     // ExitBB can be re-used for the exit block of the Plan.
327     NewBB = State->CFG.ExitBB;
328     State->CFG.PrevBB = NewBB;
329 
330     // Update the branch instruction in the predecessor to branch to ExitBB.
331     VPBlockBase *PredVPB = getSingleHierarchicalPredecessor();
332     VPBasicBlock *ExitingVPBB = PredVPB->getExitingBasicBlock();
333     assert(PredVPB->getSingleSuccessor() == this &&
334            "predecessor must have the current block as only successor");
335     BasicBlock *ExitingBB = State->CFG.VPBB2IRBB[ExitingVPBB];
336     // The Exit block of a loop is always set to be successor 0 of the Exiting
337     // block.
338     cast<BranchInst>(ExitingBB->getTerminator())->setSuccessor(0, NewBB);
339   } else if (PrevVPBB && /* A */
340              !((SingleHPred = getSingleHierarchicalPredecessor()) &&
341                SingleHPred->getExitingBasicBlock() == PrevVPBB &&
342                PrevVPBB->getSingleHierarchicalSuccessor() &&
343                (SingleHPred->getParent() == getEnclosingLoopRegion() &&
344                 !IsLoopRegion(SingleHPred))) &&         /* B */
345              !(Replica && getPredecessors().empty())) { /* C */
346     // The last IR basic block is reused, as an optimization, in three cases:
347     // A. the first VPBB reuses the loop pre-header BB - when PrevVPBB is null;
348     // B. when the current VPBB has a single (hierarchical) predecessor which
349     //    is PrevVPBB and the latter has a single (hierarchical) successor which
350     //    both are in the same non-replicator region; and
351     // C. when the current VPBB is an entry of a region replica - where PrevVPBB
352     //    is the exiting VPBB of this region from a previous instance, or the
353     //    predecessor of this region.
354 
355     NewBB = createEmptyBasicBlock(State->CFG);
356     State->Builder.SetInsertPoint(NewBB);
357     // Temporarily terminate with unreachable until CFG is rewired.
358     UnreachableInst *Terminator = State->Builder.CreateUnreachable();
359     // Register NewBB in its loop. In innermost loops its the same for all
360     // BB's.
361     if (State->CurrentVectorLoop)
362       State->CurrentVectorLoop->addBasicBlockToLoop(NewBB, *State->LI);
363     State->Builder.SetInsertPoint(Terminator);
364     State->CFG.PrevBB = NewBB;
365   }
366 
367   // 2. Fill the IR basic block with IR instructions.
368   LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName()
369                     << " in BB:" << NewBB->getName() << '\n');
370 
371   State->CFG.VPBB2IRBB[this] = NewBB;
372   State->CFG.PrevVPBB = this;
373 
374   for (VPRecipeBase &Recipe : Recipes)
375     Recipe.execute(*State);
376 
377   LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
378 }
379 
380 void VPBasicBlock::dropAllReferences(VPValue *NewValue) {
381   for (VPRecipeBase &R : Recipes) {
382     for (auto *Def : R.definedValues())
383       Def->replaceAllUsesWith(NewValue);
384 
385     for (unsigned I = 0, E = R.getNumOperands(); I != E; I++)
386       R.setOperand(I, NewValue);
387   }
388 }
389 
390 VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
391   assert((SplitAt == end() || SplitAt->getParent() == this) &&
392          "can only split at a position in the same block");
393 
394   SmallVector<VPBlockBase *, 2> Succs(successors());
395   // First, disconnect the current block from its successors.
396   for (VPBlockBase *Succ : Succs)
397     VPBlockUtils::disconnectBlocks(this, Succ);
398 
399   // Create new empty block after the block to split.
400   auto *SplitBlock = new VPBasicBlock(getName() + ".split");
401   VPBlockUtils::insertBlockAfter(SplitBlock, this);
402 
403   // Add successors for block to split to new block.
404   for (VPBlockBase *Succ : Succs)
405     VPBlockUtils::connectBlocks(SplitBlock, Succ);
406 
407   // Finally, move the recipes starting at SplitAt to new block.
408   for (VPRecipeBase &ToMove :
409        make_early_inc_range(make_range(SplitAt, this->end())))
410     ToMove.moveBefore(*SplitBlock, SplitBlock->end());
411 
412   return SplitBlock;
413 }
414 
415 VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() {
416   VPRegionBlock *P = getParent();
417   if (P && P->isReplicator()) {
418     P = P->getParent();
419     assert(!cast<VPRegionBlock>(P)->isReplicator() &&
420            "unexpected nested replicate regions");
421   }
422   return P;
423 }
424 
425 static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
426   if (VPBB->empty()) {
427     assert(
428         VPBB->getNumSuccessors() < 2 &&
429         "block with multiple successors doesn't have a recipe as terminator");
430     return false;
431   }
432 
433   const VPRecipeBase *R = &VPBB->back();
434   auto *VPI = dyn_cast<VPInstruction>(R);
435   bool IsCondBranch =
436       isa<VPBranchOnMaskRecipe>(R) ||
437       (VPI && (VPI->getOpcode() == VPInstruction::BranchOnCond ||
438                VPI->getOpcode() == VPInstruction::BranchOnCount));
439   (void)IsCondBranch;
440 
441   if (VPBB->getNumSuccessors() >= 2 || VPBB->isExiting()) {
442     assert(IsCondBranch && "block with multiple successors not terminated by "
443                            "conditional branch recipe");
444 
445     return true;
446   }
447 
448   assert(
449       !IsCondBranch &&
450       "block with 0 or 1 successors terminated by conditional branch recipe");
451   return false;
452 }
453 
454 VPRecipeBase *VPBasicBlock::getTerminator() {
455   if (hasConditionalTerminator(this))
456     return &back();
457   return nullptr;
458 }
459 
460 const VPRecipeBase *VPBasicBlock::getTerminator() const {
461   if (hasConditionalTerminator(this))
462     return &back();
463   return nullptr;
464 }
465 
466 bool VPBasicBlock::isExiting() const {
467   return getParent()->getExitingBasicBlock() == this;
468 }
469 
470 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
471 void VPBlockBase::printSuccessors(raw_ostream &O, const Twine &Indent) const {
472   if (getSuccessors().empty()) {
473     O << Indent << "No successors\n";
474   } else {
475     O << Indent << "Successor(s): ";
476     ListSeparator LS;
477     for (auto *Succ : getSuccessors())
478       O << LS << Succ->getName();
479     O << '\n';
480   }
481 }
482 
483 void VPBasicBlock::print(raw_ostream &O, const Twine &Indent,
484                          VPSlotTracker &SlotTracker) const {
485   O << Indent << getName() << ":\n";
486 
487   auto RecipeIndent = Indent + "  ";
488   for (const VPRecipeBase &Recipe : *this) {
489     Recipe.print(O, RecipeIndent, SlotTracker);
490     O << '\n';
491   }
492 
493   printSuccessors(O, Indent);
494 }
495 #endif
496 
497 void VPRegionBlock::dropAllReferences(VPValue *NewValue) {
498   for (VPBlockBase *Block : depth_first(Entry))
499     // Drop all references in VPBasicBlocks and replace all uses with
500     // DummyValue.
501     Block->dropAllReferences(NewValue);
502 }
503 
504 void VPRegionBlock::execute(VPTransformState *State) {
505   ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry);
506 
507   if (!isReplicator()) {
508     // Create and register the new vector loop.
509     Loop *PrevLoop = State->CurrentVectorLoop;
510     State->CurrentVectorLoop = State->LI->AllocateLoop();
511     BasicBlock *VectorPH = State->CFG.VPBB2IRBB[getPreheaderVPBB()];
512     Loop *ParentLoop = State->LI->getLoopFor(VectorPH);
513 
514     // Insert the new loop into the loop nest and register the new basic blocks
515     // before calling any utilities such as SCEV that require valid LoopInfo.
516     if (ParentLoop)
517       ParentLoop->addChildLoop(State->CurrentVectorLoop);
518     else
519       State->LI->addTopLevelLoop(State->CurrentVectorLoop);
520 
521     // Visit the VPBlocks connected to "this", starting from it.
522     for (VPBlockBase *Block : RPOT) {
523       LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
524       Block->execute(State);
525     }
526 
527     State->CurrentVectorLoop = PrevLoop;
528     return;
529   }
530 
531   assert(!State->Instance && "Replicating a Region with non-null instance.");
532 
533   // Enter replicating mode.
534   State->Instance = VPIteration(0, 0);
535 
536   for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) {
537     State->Instance->Part = Part;
538     assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");
539     for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
540          ++Lane) {
541       State->Instance->Lane = VPLane(Lane, VPLane::Kind::First);
542       // Visit the VPBlocks connected to \p this, starting from it.
543       for (VPBlockBase *Block : RPOT) {
544         LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
545         Block->execute(State);
546       }
547     }
548   }
549 
550   // Exit replicating mode.
551   State->Instance.reset();
552 }
553 
554 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
555 void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
556                           VPSlotTracker &SlotTracker) const {
557   O << Indent << (isReplicator() ? "<xVFxUF> " : "<x1> ") << getName() << ": {";
558   auto NewIndent = Indent + "  ";
559   for (auto *BlockBase : depth_first(Entry)) {
560     O << '\n';
561     BlockBase->print(O, NewIndent, SlotTracker);
562   }
563   O << Indent << "}\n";
564 
565   printSuccessors(O, Indent);
566 }
567 #endif
568 
569 void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
570                              Value *CanonicalIVStartValue,
571                              VPTransformState &State,
572                              bool IsEpilogueVectorization) {
573 
574   VPBasicBlock *ExitingVPBB = getVectorLoopRegion()->getExitingBasicBlock();
575   auto *Term = dyn_cast<VPInstruction>(&ExitingVPBB->back());
576   // Try to simplify BranchOnCount to 'BranchOnCond true' if TC <= VF * UF when
577   // preparing to execute the plan for the main vector loop.
578   if (!IsEpilogueVectorization && Term &&
579       Term->getOpcode() == VPInstruction::BranchOnCount &&
580       isa<ConstantInt>(TripCountV)) {
581     ConstantInt *C = cast<ConstantInt>(TripCountV);
582     uint64_t TCVal = C->getZExtValue();
583     if (TCVal && TCVal <= State.VF.getKnownMinValue() * State.UF) {
584       auto *BOC =
585           new VPInstruction(VPInstruction::BranchOnCond,
586                             {getOrAddExternalDef(State.Builder.getTrue())});
587       Term->eraseFromParent();
588       ExitingVPBB->appendRecipe(BOC);
589       // TODO: Further simplifications are possible
590       //      1. Replace inductions with constants.
591       //      2. Replace vector loop region with VPBasicBlock.
592     }
593   }
594 
595   // Check if the trip count is needed, and if so build it.
596   if (TripCount && TripCount->getNumUsers()) {
597     for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
598       State.set(TripCount, TripCountV, Part);
599   }
600 
601   // Check if the backedge taken count is needed, and if so build it.
602   if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
603     IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
604     auto *TCMO = Builder.CreateSub(TripCountV,
605                                    ConstantInt::get(TripCountV->getType(), 1),
606                                    "trip.count.minus.1");
607     auto VF = State.VF;
608     Value *VTCMO =
609         VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast");
610     for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
611       State.set(BackedgeTakenCount, VTCMO, Part);
612   }
613 
614   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
615     State.set(&VectorTripCount, VectorTripCountV, Part);
616 
617   // When vectorizing the epilogue loop, the canonical induction start value
618   // needs to be changed from zero to the value after the main vector loop.
619   if (CanonicalIVStartValue) {
620     VPValue *VPV = getOrAddExternalDef(CanonicalIVStartValue);
621     auto *IV = getCanonicalIV();
622     assert(all_of(IV->users(),
623                   [](const VPUser *U) {
624                     if (isa<VPScalarIVStepsRecipe>(U))
625                       return true;
626                     auto *VPI = cast<VPInstruction>(U);
627                     return VPI->getOpcode() ==
628                                VPInstruction::CanonicalIVIncrement ||
629                            VPI->getOpcode() ==
630                                VPInstruction::CanonicalIVIncrementNUW;
631                   }) &&
632            "the canonical IV should only be used by its increments or "
633            "ScalarIVSteps when "
634            "resetting the start value");
635     IV->setOperand(0, VPV);
636   }
637 }
638 
639 /// Generate the code inside the preheader and body of the vectorized loop.
640 /// Assumes a single pre-header basic-block was created for this. Introduce
641 /// additional basic-blocks as needed, and fill them all.
642 void VPlan::execute(VPTransformState *State) {
643   // Set the reverse mapping from VPValues to Values for code generation.
644   for (auto &Entry : Value2VPValue)
645     State->VPValue2Value[Entry.second] = Entry.first;
646 
647   // Initialize CFG state.
648   State->CFG.PrevVPBB = nullptr;
649   State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
650   BasicBlock *VectorPreHeader = State->CFG.PrevBB;
651   State->Builder.SetInsertPoint(VectorPreHeader->getTerminator());
652 
653   // Generate code in the loop pre-header and body.
654   for (VPBlockBase *Block : depth_first(Entry))
655     Block->execute(State);
656 
657   VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
658   BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
659 
660   // Fix the latch value of canonical, reduction and first-order recurrences
661   // phis in the vector loop.
662   VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
663   for (VPRecipeBase &R : Header->phis()) {
664     // Skip phi-like recipes that generate their backedege values themselves.
665     if (isa<VPWidenPHIRecipe>(&R))
666       continue;
667 
668     if (isa<VPWidenPointerInductionRecipe>(&R) ||
669         isa<VPWidenIntOrFpInductionRecipe>(&R)) {
670       PHINode *Phi = nullptr;
671       if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
672         Phi = cast<PHINode>(State->get(R.getVPSingleValue(), 0));
673       } else {
674         auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
675         // TODO: Split off the case that all users of a pointer phi are scalar
676         // from the VPWidenPointerInductionRecipe.
677         if (WidenPhi->onlyScalarsGenerated(State->VF))
678           continue;
679 
680         auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi, 0));
681         Phi = cast<PHINode>(GEP->getPointerOperand());
682       }
683 
684       Phi->setIncomingBlock(1, VectorLatchBB);
685 
686       // Move the last step to the end of the latch block. This ensures
687       // consistent placement of all induction updates.
688       Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
689       Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
690       continue;
691     }
692 
693     auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
694     // For  canonical IV, first-order recurrences and in-order reduction phis,
695     // only a single part is generated, which provides the last part from the
696     // previous iteration. For non-ordered reductions all UF parts are
697     // generated.
698     bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
699                             isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) ||
700                             cast<VPReductionPHIRecipe>(PhiR)->isOrdered();
701     unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
702 
703     for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
704       Value *Phi = State->get(PhiR, Part);
705       Value *Val = State->get(PhiR->getBackedgeValue(),
706                               SinglePartNeeded ? State->UF - 1 : Part);
707       cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
708     }
709   }
710 
711   // We do not attempt to preserve DT for outer loop vectorization currently.
712   if (!EnableVPlanNativePath) {
713     BasicBlock *VectorHeaderBB = State->CFG.VPBB2IRBB[Header];
714     State->DT->addNewBlock(VectorHeaderBB, VectorPreHeader);
715     updateDominatorTree(State->DT, VectorHeaderBB, VectorLatchBB,
716                         State->CFG.ExitBB);
717   }
718 }
719 
720 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
721 LLVM_DUMP_METHOD
722 void VPlan::print(raw_ostream &O) const {
723   VPSlotTracker SlotTracker(this);
724 
725   O << "VPlan '" << Name << "' {";
726 
727   if (VectorTripCount.getNumUsers() > 0) {
728     O << "\nLive-in ";
729     VectorTripCount.printAsOperand(O, SlotTracker);
730     O << " = vector-trip-count\n";
731   }
732 
733   if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
734     O << "\nLive-in ";
735     BackedgeTakenCount->printAsOperand(O, SlotTracker);
736     O << " = backedge-taken count\n";
737   }
738 
739   for (const VPBlockBase *Block : depth_first(getEntry())) {
740     O << '\n';
741     Block->print(O, "", SlotTracker);
742   }
743 
744   if (!LiveOuts.empty())
745     O << "\n";
746   for (auto &KV : LiveOuts) {
747     O << "Live-out ";
748     KV.second->getPhi()->printAsOperand(O);
749     O << " = ";
750     KV.second->getOperand(0)->printAsOperand(O, SlotTracker);
751     O << "\n";
752   }
753 
754   O << "}\n";
755 }
756 
757 LLVM_DUMP_METHOD
758 void VPlan::printDOT(raw_ostream &O) const {
759   VPlanPrinter Printer(O, *this);
760   Printer.dump();
761 }
762 
763 LLVM_DUMP_METHOD
764 void VPlan::dump() const { print(dbgs()); }
765 #endif
766 
767 void VPlan::addLiveOut(PHINode *PN, VPValue *V) {
768   assert(LiveOuts.count(PN) == 0 && "an exit value for PN already exists");
769   LiveOuts.insert({PN, new VPLiveOut(PN, V)});
770 }
771 
772 void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB,
773                                 BasicBlock *LoopLatchBB,
774                                 BasicBlock *LoopExitBB) {
775   // The vector body may be more than a single basic-block by this point.
776   // Update the dominator tree information inside the vector body by propagating
777   // it from header to latch, expecting only triangular control-flow, if any.
778   BasicBlock *PostDomSucc = nullptr;
779   for (auto *BB = LoopHeaderBB; BB != LoopLatchBB; BB = PostDomSucc) {
780     // Get the list of successors of this block.
781     std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB));
782     assert(Succs.size() <= 2 &&
783            "Basic block in vector loop has more than 2 successors.");
784     PostDomSucc = Succs[0];
785     if (Succs.size() == 1) {
786       assert(PostDomSucc->getSinglePredecessor() &&
787              "PostDom successor has more than one predecessor.");
788       DT->addNewBlock(PostDomSucc, BB);
789       continue;
790     }
791     BasicBlock *InterimSucc = Succs[1];
792     if (PostDomSucc->getSingleSuccessor() == InterimSucc) {
793       PostDomSucc = Succs[1];
794       InterimSucc = Succs[0];
795     }
796     assert(InterimSucc->getSingleSuccessor() == PostDomSucc &&
797            "One successor of a basic block does not lead to the other.");
798     assert(InterimSucc->getSinglePredecessor() &&
799            "Interim successor has more than one predecessor.");
800     assert(PostDomSucc->hasNPredecessors(2) &&
801            "PostDom successor has more than two predecessors.");
802     DT->addNewBlock(InterimSucc, BB);
803     DT->addNewBlock(PostDomSucc, BB);
804   }
805   // Latch block is a new dominator for the loop exit.
806   DT->changeImmediateDominator(LoopExitBB, LoopLatchBB);
807   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
808 }
809 
810 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
811 
812 Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
813   return (isa<VPRegionBlock>(Block) ? "cluster_N" : "N") +
814          Twine(getOrCreateBID(Block));
815 }
816 
817 Twine VPlanPrinter::getOrCreateName(const VPBlockBase *Block) {
818   const std::string &Name = Block->getName();
819   if (!Name.empty())
820     return Name;
821   return "VPB" + Twine(getOrCreateBID(Block));
822 }
823 
824 void VPlanPrinter::dump() {
825   Depth = 1;
826   bumpIndent(0);
827   OS << "digraph VPlan {\n";
828   OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
829   if (!Plan.getName().empty())
830     OS << "\\n" << DOT::EscapeString(Plan.getName());
831   if (Plan.BackedgeTakenCount) {
832     OS << ", where:\\n";
833     Plan.BackedgeTakenCount->print(OS, SlotTracker);
834     OS << " := BackedgeTakenCount";
835   }
836   OS << "\"]\n";
837   OS << "node [shape=rect, fontname=Courier, fontsize=30]\n";
838   OS << "edge [fontname=Courier, fontsize=30]\n";
839   OS << "compound=true\n";
840 
841   for (const VPBlockBase *Block : depth_first(Plan.getEntry()))
842     dumpBlock(Block);
843 
844   OS << "}\n";
845 }
846 
847 void VPlanPrinter::dumpBlock(const VPBlockBase *Block) {
848   if (const VPBasicBlock *BasicBlock = dyn_cast<VPBasicBlock>(Block))
849     dumpBasicBlock(BasicBlock);
850   else if (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
851     dumpRegion(Region);
852   else
853     llvm_unreachable("Unsupported kind of VPBlock.");
854 }
855 
856 void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To,
857                             bool Hidden, const Twine &Label) {
858   // Due to "dot" we print an edge between two regions as an edge between the
859   // exiting basic block and the entry basic of the respective regions.
860   const VPBlockBase *Tail = From->getExitingBasicBlock();
861   const VPBlockBase *Head = To->getEntryBasicBlock();
862   OS << Indent << getUID(Tail) << " -> " << getUID(Head);
863   OS << " [ label=\"" << Label << '\"';
864   if (Tail != From)
865     OS << " ltail=" << getUID(From);
866   if (Head != To)
867     OS << " lhead=" << getUID(To);
868   if (Hidden)
869     OS << "; splines=none";
870   OS << "]\n";
871 }
872 
873 void VPlanPrinter::dumpEdges(const VPBlockBase *Block) {
874   auto &Successors = Block->getSuccessors();
875   if (Successors.size() == 1)
876     drawEdge(Block, Successors.front(), false, "");
877   else if (Successors.size() == 2) {
878     drawEdge(Block, Successors.front(), false, "T");
879     drawEdge(Block, Successors.back(), false, "F");
880   } else {
881     unsigned SuccessorNumber = 0;
882     for (auto *Successor : Successors)
883       drawEdge(Block, Successor, false, Twine(SuccessorNumber++));
884   }
885 }
886 
887 void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
888   // Implement dot-formatted dump by performing plain-text dump into the
889   // temporary storage followed by some post-processing.
890   OS << Indent << getUID(BasicBlock) << " [label =\n";
891   bumpIndent(1);
892   std::string Str;
893   raw_string_ostream SS(Str);
894   // Use no indentation as we need to wrap the lines into quotes ourselves.
895   BasicBlock->print(SS, "", SlotTracker);
896 
897   // We need to process each line of the output separately, so split
898   // single-string plain-text dump.
899   SmallVector<StringRef, 0> Lines;
900   StringRef(Str).rtrim('\n').split(Lines, "\n");
901 
902   auto EmitLine = [&](StringRef Line, StringRef Suffix) {
903     OS << Indent << '"' << DOT::EscapeString(Line.str()) << "\\l\"" << Suffix;
904   };
905 
906   // Don't need the "+" after the last line.
907   for (auto Line : make_range(Lines.begin(), Lines.end() - 1))
908     EmitLine(Line, " +\n");
909   EmitLine(Lines.back(), "\n");
910 
911   bumpIndent(-1);
912   OS << Indent << "]\n";
913 
914   dumpEdges(BasicBlock);
915 }
916 
917 void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) {
918   OS << Indent << "subgraph " << getUID(Region) << " {\n";
919   bumpIndent(1);
920   OS << Indent << "fontname=Courier\n"
921      << Indent << "label=\""
922      << DOT::EscapeString(Region->isReplicator() ? "<xVFxUF> " : "<x1> ")
923      << DOT::EscapeString(Region->getName()) << "\"\n";
924   // Dump the blocks of the region.
925   assert(Region->getEntry() && "Region contains no inner blocks.");
926   for (const VPBlockBase *Block : depth_first(Region->getEntry()))
927     dumpBlock(Block);
928   bumpIndent(-1);
929   OS << Indent << "}\n";
930   dumpEdges(Region);
931 }
932 
933 void VPlanIngredient::print(raw_ostream &O) const {
934   if (auto *Inst = dyn_cast<Instruction>(V)) {
935     if (!Inst->getType()->isVoidTy()) {
936       Inst->printAsOperand(O, false);
937       O << " = ";
938     }
939     O << Inst->getOpcodeName() << " ";
940     unsigned E = Inst->getNumOperands();
941     if (E > 0) {
942       Inst->getOperand(0)->printAsOperand(O, false);
943       for (unsigned I = 1; I < E; ++I)
944         Inst->getOperand(I)->printAsOperand(O << ", ", false);
945     }
946   } else // !Inst
947     V->printAsOperand(O, false);
948 }
949 
950 #endif
951 
952 template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
953 
954 void VPValue::replaceAllUsesWith(VPValue *New) {
955   for (unsigned J = 0; J < getNumUsers();) {
956     VPUser *User = Users[J];
957     unsigned NumUsers = getNumUsers();
958     for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I)
959       if (User->getOperand(I) == this)
960         User->setOperand(I, New);
961     // If a user got removed after updating the current user, the next user to
962     // update will be moved to the current position, so we only need to
963     // increment the index if the number of users did not change.
964     if (NumUsers == getNumUsers())
965       J++;
966   }
967 }
968 
969 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
970 void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const {
971   if (const Value *UV = getUnderlyingValue()) {
972     OS << "ir<";
973     UV->printAsOperand(OS, false);
974     OS << ">";
975     return;
976   }
977 
978   unsigned Slot = Tracker.getSlot(this);
979   if (Slot == unsigned(-1))
980     OS << "<badref>";
981   else
982     OS << "vp<%" << Tracker.getSlot(this) << ">";
983 }
984 
985 void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const {
986   interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
987     Op->printAsOperand(O, SlotTracker);
988   });
989 }
990 #endif
991 
992 void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
993                                           Old2NewTy &Old2New,
994                                           InterleavedAccessInfo &IAI) {
995   ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry());
996   for (VPBlockBase *Base : RPOT) {
997     visitBlock(Base, Old2New, IAI);
998   }
999 }
1000 
1001 void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
1002                                          InterleavedAccessInfo &IAI) {
1003   if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) {
1004     for (VPRecipeBase &VPI : *VPBB) {
1005       if (isa<VPHeaderPHIRecipe>(&VPI))
1006         continue;
1007       assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions");
1008       auto *VPInst = cast<VPInstruction>(&VPI);
1009 
1010       auto *Inst = dyn_cast_or_null<Instruction>(VPInst->getUnderlyingValue());
1011       if (!Inst)
1012         continue;
1013       auto *IG = IAI.getInterleaveGroup(Inst);
1014       if (!IG)
1015         continue;
1016 
1017       auto NewIGIter = Old2New.find(IG);
1018       if (NewIGIter == Old2New.end())
1019         Old2New[IG] = new InterleaveGroup<VPInstruction>(
1020             IG->getFactor(), IG->isReverse(), IG->getAlign());
1021 
1022       if (Inst == IG->getInsertPos())
1023         Old2New[IG]->setInsertPos(VPInst);
1024 
1025       InterleaveGroupMap[VPInst] = Old2New[IG];
1026       InterleaveGroupMap[VPInst]->insertMember(
1027           VPInst, IG->getIndex(Inst),
1028           Align(IG->isReverse() ? (-1) * int(IG->getFactor())
1029                                 : IG->getFactor()));
1030     }
1031   } else if (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
1032     visitRegion(Region, Old2New, IAI);
1033   else
1034     llvm_unreachable("Unsupported kind of VPBlock.");
1035 }
1036 
1037 VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
1038                                                  InterleavedAccessInfo &IAI) {
1039   Old2NewTy Old2New;
1040   visitRegion(Plan.getVectorLoopRegion(), Old2New, IAI);
1041 }
1042 
1043 void VPSlotTracker::assignSlot(const VPValue *V) {
1044   assert(Slots.find(V) == Slots.end() && "VPValue already has a slot!");
1045   Slots[V] = NextSlot++;
1046 }
1047 
1048 void VPSlotTracker::assignSlots(const VPlan &Plan) {
1049 
1050   for (const auto &P : Plan.VPExternalDefs)
1051     assignSlot(P.second);
1052 
1053   assignSlot(&Plan.VectorTripCount);
1054   if (Plan.BackedgeTakenCount)
1055     assignSlot(Plan.BackedgeTakenCount);
1056 
1057   ReversePostOrderTraversal<
1058       VPBlockRecursiveTraversalWrapper<const VPBlockBase *>>
1059       RPOT(VPBlockRecursiveTraversalWrapper<const VPBlockBase *>(
1060           Plan.getEntry()));
1061   for (const VPBasicBlock *VPBB :
1062        VPBlockUtils::blocksOnly<const VPBasicBlock>(RPOT))
1063     for (const VPRecipeBase &Recipe : *VPBB)
1064       for (VPValue *Def : Recipe.definedValues())
1065         assignSlot(Def);
1066 }
1067 
1068 bool vputils::onlyFirstLaneUsed(VPValue *Def) {
1069   return all_of(Def->users(),
1070                 [Def](VPUser *U) { return U->onlyFirstLaneUsed(Def); });
1071 }
1072 
1073 VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
1074                                                 ScalarEvolution &SE) {
1075   if (auto *E = dyn_cast<SCEVConstant>(Expr))
1076     return Plan.getOrAddExternalDef(E->getValue());
1077   if (auto *E = dyn_cast<SCEVUnknown>(Expr))
1078     return Plan.getOrAddExternalDef(E->getValue());
1079 
1080   VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock();
1081   VPValue *Step = new VPExpandSCEVRecipe(Expr, SE);
1082   Preheader->appendRecipe(cast<VPRecipeBase>(Step->getDef()));
1083   return Step;
1084 }
1085