xref: /llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp (revision 569d84fe99e63e830ea036598f7fa7a5f9899d7c)
1 //===- VPlan.cpp - Vectorizer Plan ----------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This is the LLVM vectorization plan. It represents a candidate for
11 /// vectorization, allowing to plan and optimize how to vectorize a given loop
12 /// before generating LLVM-IR.
13 /// The vectorizer uses vectorization plans to estimate the costs of potential
14 /// candidates and if profitable to execute the desired plan, generating vector
15 /// LLVM-IR code.
16 ///
17 //===----------------------------------------------------------------------===//
18 
19 #include "VPlan.h"
20 #include "VPlanDominatorTree.h"
21 #include "llvm/ADT/DepthFirstIterator.h"
22 #include "llvm/ADT/PostOrderIterator.h"
23 #include "llvm/ADT/STLExtras.h"
24 #include "llvm/ADT/SmallVector.h"
25 #include "llvm/ADT/Twine.h"
26 #include "llvm/Analysis/IVDescriptors.h"
27 #include "llvm/Analysis/LoopInfo.h"
28 #include "llvm/IR/BasicBlock.h"
29 #include "llvm/IR/CFG.h"
30 #include "llvm/IR/IRBuilder.h"
31 #include "llvm/IR/Instruction.h"
32 #include "llvm/IR/Instructions.h"
33 #include "llvm/IR/Type.h"
34 #include "llvm/IR/Value.h"
35 #include "llvm/Support/Casting.h"
36 #include "llvm/Support/CommandLine.h"
37 #include "llvm/Support/Debug.h"
38 #include "llvm/Support/ErrorHandling.h"
39 #include "llvm/Support/GenericDomTreeConstruction.h"
40 #include "llvm/Support/GraphWriter.h"
41 #include "llvm/Support/raw_ostream.h"
42 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
43 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
44 #include <cassert>
45 #include <string>
46 #include <vector>
47 
48 using namespace llvm;
49 extern cl::opt<bool> EnableVPlanNativePath;
50 
51 #define DEBUG_TYPE "vplan"
52 
53 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
54 raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
55   const VPInstruction *Instr = dyn_cast<VPInstruction>(&V);
56   VPSlotTracker SlotTracker(
57       (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
58   V.print(OS, SlotTracker);
59   return OS;
60 }
61 #endif
62 
63 Value *VPLane::getAsRuntimeExpr(IRBuilderBase &Builder,
64                                 const ElementCount &VF) const {
65   switch (LaneKind) {
66   case VPLane::Kind::ScalableLast:
67     // Lane = RuntimeVF - VF.getKnownMinValue() + Lane
68     return Builder.CreateSub(getRuntimeVF(Builder, Builder.getInt32Ty(), VF),
69                              Builder.getInt32(VF.getKnownMinValue() - Lane));
70   case VPLane::Kind::First:
71     return Builder.getInt32(Lane);
72   }
73   llvm_unreachable("Unknown lane kind");
74 }
75 
76 VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def)
77     : SubclassID(SC), UnderlyingVal(UV), Def(Def) {
78   if (Def)
79     Def->addDefinedValue(this);
80 }
81 
82 VPValue::~VPValue() {
83   assert(Users.empty() && "trying to delete a VPValue with remaining users");
84   if (Def)
85     Def->removeDefinedValue(this);
86 }
87 
88 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
89 void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const {
90   if (const VPRecipeBase *R = dyn_cast_or_null<VPRecipeBase>(Def))
91     R->print(OS, "", SlotTracker);
92   else
93     printAsOperand(OS, SlotTracker);
94 }
95 
96 void VPValue::dump() const {
97   const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this->Def);
98   VPSlotTracker SlotTracker(
99       (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
100   print(dbgs(), SlotTracker);
101   dbgs() << "\n";
102 }
103 
104 void VPDef::dump() const {
105   const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this);
106   VPSlotTracker SlotTracker(
107       (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
108   print(dbgs(), "", SlotTracker);
109   dbgs() << "\n";
110 }
111 #endif
112 
113 // Get the top-most entry block of \p Start. This is the entry block of the
114 // containing VPlan. This function is templated to support both const and non-const blocks
115 template <typename T> static T *getPlanEntry(T *Start) {
116   T *Next = Start;
117   T *Current = Start;
118   while ((Next = Next->getParent()))
119     Current = Next;
120 
121   SmallSetVector<T *, 8> WorkList;
122   WorkList.insert(Current);
123 
124   for (unsigned i = 0; i < WorkList.size(); i++) {
125     T *Current = WorkList[i];
126     if (Current->getNumPredecessors() == 0)
127       return Current;
128     auto &Predecessors = Current->getPredecessors();
129     WorkList.insert(Predecessors.begin(), Predecessors.end());
130   }
131 
132   llvm_unreachable("VPlan without any entry node without predecessors");
133 }
134 
135 VPlan *VPBlockBase::getPlan() { return getPlanEntry(this)->Plan; }
136 
137 const VPlan *VPBlockBase::getPlan() const { return getPlanEntry(this)->Plan; }
138 
139 /// \return the VPBasicBlock that is the entry of Block, possibly indirectly.
140 const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const {
141   const VPBlockBase *Block = this;
142   while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
143     Block = Region->getEntry();
144   return cast<VPBasicBlock>(Block);
145 }
146 
147 VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
148   VPBlockBase *Block = this;
149   while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
150     Block = Region->getEntry();
151   return cast<VPBasicBlock>(Block);
152 }
153 
154 void VPBlockBase::setPlan(VPlan *ParentPlan) {
155   assert(ParentPlan->getEntry() == this &&
156          "Can only set plan on its entry block.");
157   Plan = ParentPlan;
158 }
159 
160 /// \return the VPBasicBlock that is the exit of Block, possibly indirectly.
161 const VPBasicBlock *VPBlockBase::getExitingBasicBlock() const {
162   const VPBlockBase *Block = this;
163   while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
164     Block = Region->getExiting();
165   return cast<VPBasicBlock>(Block);
166 }
167 
168 VPBasicBlock *VPBlockBase::getExitingBasicBlock() {
169   VPBlockBase *Block = this;
170   while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
171     Block = Region->getExiting();
172   return cast<VPBasicBlock>(Block);
173 }
174 
175 VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() {
176   if (!Successors.empty() || !Parent)
177     return this;
178   assert(Parent->getExiting() == this &&
179          "Block w/o successors not the exiting block of its parent.");
180   return Parent->getEnclosingBlockWithSuccessors();
181 }
182 
183 VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
184   if (!Predecessors.empty() || !Parent)
185     return this;
186   assert(Parent->getEntry() == this &&
187          "Block w/o predecessors not the entry of its parent.");
188   return Parent->getEnclosingBlockWithPredecessors();
189 }
190 
191 void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
192   SmallVector<VPBlockBase *, 8> Blocks(depth_first(Entry));
193 
194   for (VPBlockBase *Block : Blocks)
195     delete Block;
196 }
197 
198 VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
199   iterator It = begin();
200   while (It != end() && It->isPhi())
201     It++;
202   return It;
203 }
204 
205 Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
206   if (!Def->getDef())
207     return Def->getLiveInIRValue();
208 
209   if (hasScalarValue(Def, Instance)) {
210     return Data
211         .PerPartScalars[Def][Instance.Part][Instance.Lane.mapToCacheIndex(VF)];
212   }
213 
214   assert(hasVectorValue(Def, Instance.Part));
215   auto *VecPart = Data.PerPartOutput[Def][Instance.Part];
216   if (!VecPart->getType()->isVectorTy()) {
217     assert(Instance.Lane.isFirstLane() && "cannot get lane > 0 for scalar");
218     return VecPart;
219   }
220   // TODO: Cache created scalar values.
221   Value *Lane = Instance.Lane.getAsRuntimeExpr(Builder, VF);
222   auto *Extract = Builder.CreateExtractElement(VecPart, Lane);
223   // set(Def, Extract, Instance);
224   return Extract;
225 }
226 BasicBlock *VPTransformState::CFGState::getPreheaderBBFor(VPRecipeBase *R) {
227   VPRegionBlock *LoopRegion = R->getParent()->getEnclosingLoopRegion();
228   return VPBB2IRBB[LoopRegion->getPreheaderVPBB()];
229 }
230 
231 BasicBlock *
232 VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
233   // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks.
234   // Pred stands for Predessor. Prev stands for Previous - last visited/created.
235   BasicBlock *PrevBB = CFG.PrevBB;
236   BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(),
237                                          PrevBB->getParent(), CFG.ExitBB);
238   LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
239 
240   // Hook up the new basic block to its predecessors.
241   for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
242     VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
243     auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
244     BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
245 
246     assert(PredBB && "Predecessor basic-block not found building successor.");
247     auto *PredBBTerminator = PredBB->getTerminator();
248     LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
249 
250     auto *TermBr = dyn_cast<BranchInst>(PredBBTerminator);
251     if (isa<UnreachableInst>(PredBBTerminator)) {
252       assert(PredVPSuccessors.size() == 1 &&
253              "Predecessor ending w/o branch must have single successor.");
254       DebugLoc DL = PredBBTerminator->getDebugLoc();
255       PredBBTerminator->eraseFromParent();
256       auto *Br = BranchInst::Create(NewBB, PredBB);
257       Br->setDebugLoc(DL);
258     } else if (TermBr && !TermBr->isConditional()) {
259       TermBr->setSuccessor(0, NewBB);
260     } else {
261       // Set each forward successor here when it is created, excluding
262       // backedges. A backward successor is set when the branch is created.
263       unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
264       assert(!TermBr->getSuccessor(idx) &&
265              "Trying to reset an existing successor block.");
266       TermBr->setSuccessor(idx, NewBB);
267     }
268   }
269   return NewBB;
270 }
271 
272 void VPBasicBlock::execute(VPTransformState *State) {
273   bool Replica = State->Instance && !State->Instance->isFirstIteration();
274   VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB;
275   VPBlockBase *SingleHPred = nullptr;
276   BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
277 
278   auto IsLoopRegion = [](VPBlockBase *BB) {
279     auto *R = dyn_cast<VPRegionBlock>(BB);
280     return R && !R->isReplicator();
281   };
282 
283   // 1. Create an IR basic block, or reuse the last one or ExitBB if possible.
284   if (getPlan()->getVectorLoopRegion()->getSingleSuccessor() == this) {
285     // ExitBB can be re-used for the exit block of the Plan.
286     NewBB = State->CFG.ExitBB;
287     State->CFG.PrevBB = NewBB;
288 
289     // Update the branch instruction in the predecessor to branch to ExitBB.
290     VPBlockBase *PredVPB = getSingleHierarchicalPredecessor();
291     VPBasicBlock *ExitingVPBB = PredVPB->getExitingBasicBlock();
292     assert(PredVPB->getSingleSuccessor() == this &&
293            "predecessor must have the current block as only successor");
294     BasicBlock *ExitingBB = State->CFG.VPBB2IRBB[ExitingVPBB];
295     // The Exit block of a loop is always set to be successor 0 of the Exiting
296     // block.
297     cast<BranchInst>(ExitingBB->getTerminator())->setSuccessor(0, NewBB);
298   } else if (PrevVPBB && /* A */
299              !((SingleHPred = getSingleHierarchicalPredecessor()) &&
300                SingleHPred->getExitingBasicBlock() == PrevVPBB &&
301                PrevVPBB->getSingleHierarchicalSuccessor() &&
302                (SingleHPred->getParent() == getEnclosingLoopRegion() &&
303                 !IsLoopRegion(SingleHPred))) &&         /* B */
304              !(Replica && getPredecessors().empty())) { /* C */
305     // The last IR basic block is reused, as an optimization, in three cases:
306     // A. the first VPBB reuses the loop pre-header BB - when PrevVPBB is null;
307     // B. when the current VPBB has a single (hierarchical) predecessor which
308     //    is PrevVPBB and the latter has a single (hierarchical) successor which
309     //    both are in the same non-replicator region; and
310     // C. when the current VPBB is an entry of a region replica - where PrevVPBB
311     //    is the exiting VPBB of this region from a previous instance, or the
312     //    predecessor of this region.
313 
314     NewBB = createEmptyBasicBlock(State->CFG);
315     State->Builder.SetInsertPoint(NewBB);
316     // Temporarily terminate with unreachable until CFG is rewired.
317     UnreachableInst *Terminator = State->Builder.CreateUnreachable();
318     // Register NewBB in its loop. In innermost loops its the same for all
319     // BB's.
320     if (State->CurrentVectorLoop)
321       State->CurrentVectorLoop->addBasicBlockToLoop(NewBB, *State->LI);
322     State->Builder.SetInsertPoint(Terminator);
323     State->CFG.PrevBB = NewBB;
324   }
325 
326   // 2. Fill the IR basic block with IR instructions.
327   LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName()
328                     << " in BB:" << NewBB->getName() << '\n');
329 
330   State->CFG.VPBB2IRBB[this] = NewBB;
331   State->CFG.PrevVPBB = this;
332 
333   for (VPRecipeBase &Recipe : Recipes)
334     Recipe.execute(*State);
335 
336   LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
337 }
338 
339 void VPBasicBlock::dropAllReferences(VPValue *NewValue) {
340   for (VPRecipeBase &R : Recipes) {
341     for (auto *Def : R.definedValues())
342       Def->replaceAllUsesWith(NewValue);
343 
344     for (unsigned I = 0, E = R.getNumOperands(); I != E; I++)
345       R.setOperand(I, NewValue);
346   }
347 }
348 
349 VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
350   assert((SplitAt == end() || SplitAt->getParent() == this) &&
351          "can only split at a position in the same block");
352 
353   SmallVector<VPBlockBase *, 2> Succs(successors());
354   // First, disconnect the current block from its successors.
355   for (VPBlockBase *Succ : Succs)
356     VPBlockUtils::disconnectBlocks(this, Succ);
357 
358   // Create new empty block after the block to split.
359   auto *SplitBlock = new VPBasicBlock(getName() + ".split");
360   VPBlockUtils::insertBlockAfter(SplitBlock, this);
361 
362   // Add successors for block to split to new block.
363   for (VPBlockBase *Succ : Succs)
364     VPBlockUtils::connectBlocks(SplitBlock, Succ);
365 
366   // Finally, move the recipes starting at SplitAt to new block.
367   for (VPRecipeBase &ToMove :
368        make_early_inc_range(make_range(SplitAt, this->end())))
369     ToMove.moveBefore(*SplitBlock, SplitBlock->end());
370 
371   return SplitBlock;
372 }
373 
374 VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() {
375   VPRegionBlock *P = getParent();
376   if (P && P->isReplicator()) {
377     P = P->getParent();
378     assert(!cast<VPRegionBlock>(P)->isReplicator() &&
379            "unexpected nested replicate regions");
380   }
381   return P;
382 }
383 
384 static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
385   if (VPBB->empty()) {
386     assert(
387         VPBB->getNumSuccessors() < 2 &&
388         "block with multiple successors doesn't have a recipe as terminator");
389     return false;
390   }
391 
392   const VPRecipeBase *R = &VPBB->back();
393   auto *VPI = dyn_cast<VPInstruction>(R);
394   bool IsCondBranch =
395       isa<VPBranchOnMaskRecipe>(R) ||
396       (VPI && (VPI->getOpcode() == VPInstruction::BranchOnCond ||
397                VPI->getOpcode() == VPInstruction::BranchOnCount));
398   (void)IsCondBranch;
399 
400   if (VPBB->getNumSuccessors() >= 2 || VPBB->isExiting()) {
401     assert(IsCondBranch && "block with multiple successors not terminated by "
402                            "conditional branch recipe");
403 
404     return true;
405   }
406 
407   assert(
408       !IsCondBranch &&
409       "block with 0 or 1 successors terminated by conditional branch recipe");
410   return false;
411 }
412 
413 VPRecipeBase *VPBasicBlock::getTerminator() {
414   if (hasConditionalTerminator(this))
415     return &back();
416   return nullptr;
417 }
418 
419 const VPRecipeBase *VPBasicBlock::getTerminator() const {
420   if (hasConditionalTerminator(this))
421     return &back();
422   return nullptr;
423 }
424 
425 bool VPBasicBlock::isExiting() const {
426   return getParent()->getExitingBasicBlock() == this;
427 }
428 
429 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
430 void VPBlockBase::printSuccessors(raw_ostream &O, const Twine &Indent) const {
431   if (getSuccessors().empty()) {
432     O << Indent << "No successors\n";
433   } else {
434     O << Indent << "Successor(s): ";
435     ListSeparator LS;
436     for (auto *Succ : getSuccessors())
437       O << LS << Succ->getName();
438     O << '\n';
439   }
440 }
441 
442 void VPBasicBlock::print(raw_ostream &O, const Twine &Indent,
443                          VPSlotTracker &SlotTracker) const {
444   O << Indent << getName() << ":\n";
445 
446   auto RecipeIndent = Indent + "  ";
447   for (const VPRecipeBase &Recipe : *this) {
448     Recipe.print(O, RecipeIndent, SlotTracker);
449     O << '\n';
450   }
451 
452   printSuccessors(O, Indent);
453 }
454 #endif
455 
456 void VPRegionBlock::dropAllReferences(VPValue *NewValue) {
457   for (VPBlockBase *Block : depth_first(Entry))
458     // Drop all references in VPBasicBlocks and replace all uses with
459     // DummyValue.
460     Block->dropAllReferences(NewValue);
461 }
462 
463 void VPRegionBlock::execute(VPTransformState *State) {
464   ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry);
465 
466   if (!isReplicator()) {
467     // Create and register the new vector loop.
468     Loop *PrevLoop = State->CurrentVectorLoop;
469     State->CurrentVectorLoop = State->LI->AllocateLoop();
470     BasicBlock *VectorPH = State->CFG.VPBB2IRBB[getPreheaderVPBB()];
471     Loop *ParentLoop = State->LI->getLoopFor(VectorPH);
472 
473     // Insert the new loop into the loop nest and register the new basic blocks
474     // before calling any utilities such as SCEV that require valid LoopInfo.
475     if (ParentLoop)
476       ParentLoop->addChildLoop(State->CurrentVectorLoop);
477     else
478       State->LI->addTopLevelLoop(State->CurrentVectorLoop);
479 
480     // Visit the VPBlocks connected to "this", starting from it.
481     for (VPBlockBase *Block : RPOT) {
482       LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
483       Block->execute(State);
484     }
485 
486     State->CurrentVectorLoop = PrevLoop;
487     return;
488   }
489 
490   assert(!State->Instance && "Replicating a Region with non-null instance.");
491 
492   // Enter replicating mode.
493   State->Instance = VPIteration(0, 0);
494 
495   for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) {
496     State->Instance->Part = Part;
497     assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");
498     for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
499          ++Lane) {
500       State->Instance->Lane = VPLane(Lane, VPLane::Kind::First);
501       // Visit the VPBlocks connected to \p this, starting from it.
502       for (VPBlockBase *Block : RPOT) {
503         LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
504         Block->execute(State);
505       }
506     }
507   }
508 
509   // Exit replicating mode.
510   State->Instance.reset();
511 }
512 
513 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
514 void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
515                           VPSlotTracker &SlotTracker) const {
516   O << Indent << (isReplicator() ? "<xVFxUF> " : "<x1> ") << getName() << ": {";
517   auto NewIndent = Indent + "  ";
518   for (auto *BlockBase : depth_first(Entry)) {
519     O << '\n';
520     BlockBase->print(O, NewIndent, SlotTracker);
521   }
522   O << Indent << "}\n";
523 
524   printSuccessors(O, Indent);
525 }
526 #endif
527 
528 bool VPRecipeBase::mayWriteToMemory() const {
529   switch (getVPDefID()) {
530   case VPWidenMemoryInstructionSC: {
531     return cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
532   }
533   case VPReplicateSC:
534   case VPWidenCallSC:
535     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
536         ->mayWriteToMemory();
537   case VPBranchOnMaskSC:
538     return false;
539   case VPWidenIntOrFpInductionSC:
540   case VPWidenCanonicalIVSC:
541   case VPWidenPHISC:
542   case VPBlendSC:
543   case VPWidenSC:
544   case VPWidenGEPSC:
545   case VPReductionSC:
546   case VPWidenSelectSC: {
547     const Instruction *I =
548         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
549     (void)I;
550     assert((!I || !I->mayWriteToMemory()) &&
551            "underlying instruction may write to memory");
552     return false;
553   }
554   default:
555     return true;
556   }
557 }
558 
559 bool VPRecipeBase::mayReadFromMemory() const {
560   switch (getVPDefID()) {
561   case VPWidenMemoryInstructionSC: {
562     return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
563   }
564   case VPReplicateSC:
565   case VPWidenCallSC:
566     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
567         ->mayReadFromMemory();
568   case VPBranchOnMaskSC:
569     return false;
570   case VPWidenIntOrFpInductionSC:
571   case VPWidenCanonicalIVSC:
572   case VPWidenPHISC:
573   case VPBlendSC:
574   case VPWidenSC:
575   case VPWidenGEPSC:
576   case VPReductionSC:
577   case VPWidenSelectSC: {
578     const Instruction *I =
579         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
580     (void)I;
581     assert((!I || !I->mayReadFromMemory()) &&
582            "underlying instruction may read from memory");
583     return false;
584   }
585   default:
586     return true;
587   }
588 }
589 
590 bool VPRecipeBase::mayHaveSideEffects() const {
591   switch (getVPDefID()) {
592   case VPWidenIntOrFpInductionSC:
593   case VPWidenPointerInductionSC:
594   case VPWidenCanonicalIVSC:
595   case VPWidenPHISC:
596   case VPBlendSC:
597   case VPWidenSC:
598   case VPWidenGEPSC:
599   case VPReductionSC:
600   case VPWidenSelectSC:
601   case VPScalarIVStepsSC: {
602     const Instruction *I =
603         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
604     (void)I;
605     assert((!I || !I->mayHaveSideEffects()) &&
606            "underlying instruction has side-effects");
607     return false;
608   }
609   case VPReplicateSC: {
610     auto *R = cast<VPReplicateRecipe>(this);
611     return R->getUnderlyingInstr()->mayHaveSideEffects();
612   }
613   default:
614     return true;
615   }
616 }
617 
618 void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
619   auto Lane = VPLane::getLastLaneForVF(State.VF);
620   VPValue *ExitValue = getOperand(0);
621   if (Plan.isUniformAfterVectorization(ExitValue))
622     Lane = VPLane::getFirstLane();
623   Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
624                    State.Builder.GetInsertBlock());
625 }
626 
627 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
628   assert(!Parent && "Recipe already in some VPBasicBlock");
629   assert(InsertPos->getParent() &&
630          "Insertion position not in any VPBasicBlock");
631   Parent = InsertPos->getParent();
632   Parent->getRecipeList().insert(InsertPos->getIterator(), this);
633 }
634 
635 void VPRecipeBase::insertBefore(VPBasicBlock &BB,
636                                 iplist<VPRecipeBase>::iterator I) {
637   assert(!Parent && "Recipe already in some VPBasicBlock");
638   assert(I == BB.end() || I->getParent() == &BB);
639   Parent = &BB;
640   BB.getRecipeList().insert(I, this);
641 }
642 
643 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
644   assert(!Parent && "Recipe already in some VPBasicBlock");
645   assert(InsertPos->getParent() &&
646          "Insertion position not in any VPBasicBlock");
647   Parent = InsertPos->getParent();
648   Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
649 }
650 
651 void VPRecipeBase::removeFromParent() {
652   assert(getParent() && "Recipe not in any VPBasicBlock");
653   getParent()->getRecipeList().remove(getIterator());
654   Parent = nullptr;
655 }
656 
657 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
658   assert(getParent() && "Recipe not in any VPBasicBlock");
659   return getParent()->getRecipeList().erase(getIterator());
660 }
661 
662 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
663   removeFromParent();
664   insertAfter(InsertPos);
665 }
666 
667 void VPRecipeBase::moveBefore(VPBasicBlock &BB,
668                               iplist<VPRecipeBase>::iterator I) {
669   removeFromParent();
670   insertBefore(BB, I);
671 }
672 
673 void VPInstruction::generateInstruction(VPTransformState &State,
674                                         unsigned Part) {
675   IRBuilderBase &Builder = State.Builder;
676   Builder.SetCurrentDebugLocation(DL);
677 
678   if (Instruction::isBinaryOp(getOpcode())) {
679     Value *A = State.get(getOperand(0), Part);
680     Value *B = State.get(getOperand(1), Part);
681     Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B);
682     State.set(this, V, Part);
683     return;
684   }
685 
686   switch (getOpcode()) {
687   case VPInstruction::Not: {
688     Value *A = State.get(getOperand(0), Part);
689     Value *V = Builder.CreateNot(A);
690     State.set(this, V, Part);
691     break;
692   }
693   case VPInstruction::ICmpULE: {
694     Value *IV = State.get(getOperand(0), Part);
695     Value *TC = State.get(getOperand(1), Part);
696     Value *V = Builder.CreateICmpULE(IV, TC);
697     State.set(this, V, Part);
698     break;
699   }
700   case Instruction::Select: {
701     Value *Cond = State.get(getOperand(0), Part);
702     Value *Op1 = State.get(getOperand(1), Part);
703     Value *Op2 = State.get(getOperand(2), Part);
704     Value *V = Builder.CreateSelect(Cond, Op1, Op2);
705     State.set(this, V, Part);
706     break;
707   }
708   case VPInstruction::ActiveLaneMask: {
709     // Get first lane of vector induction variable.
710     Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
711     // Get the original loop tripcount.
712     Value *ScalarTC = State.get(getOperand(1), Part);
713 
714     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
715     auto *PredTy = VectorType::get(Int1Ty, State.VF);
716     Instruction *Call = Builder.CreateIntrinsic(
717         Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()},
718         {VIVElem0, ScalarTC}, nullptr, "active.lane.mask");
719     State.set(this, Call, Part);
720     break;
721   }
722   case VPInstruction::FirstOrderRecurrenceSplice: {
723     // Generate code to combine the previous and current values in vector v3.
724     //
725     //   vector.ph:
726     //     v_init = vector(..., ..., ..., a[-1])
727     //     br vector.body
728     //
729     //   vector.body
730     //     i = phi [0, vector.ph], [i+4, vector.body]
731     //     v1 = phi [v_init, vector.ph], [v2, vector.body]
732     //     v2 = a[i, i+1, i+2, i+3];
733     //     v3 = vector(v1(3), v2(0, 1, 2))
734 
735     // For the first part, use the recurrence phi (v1), otherwise v2.
736     auto *V1 = State.get(getOperand(0), 0);
737     Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
738     if (!PartMinus1->getType()->isVectorTy()) {
739       State.set(this, PartMinus1, Part);
740     } else {
741       Value *V2 = State.get(getOperand(1), Part);
742       State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1), Part);
743     }
744     break;
745   }
746 
747   case VPInstruction::CanonicalIVIncrement:
748   case VPInstruction::CanonicalIVIncrementNUW: {
749     Value *Next = nullptr;
750     if (Part == 0) {
751       bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW;
752       auto *Phi = State.get(getOperand(0), 0);
753       // The loop step is equal to the vectorization factor (num of SIMD
754       // elements) times the unroll factor (num of SIMD instructions).
755       Value *Step =
756           createStepForVF(Builder, Phi->getType(), State.VF, State.UF);
757       Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false);
758     } else {
759       Next = State.get(this, 0);
760     }
761 
762     State.set(this, Next, Part);
763     break;
764   }
765   case VPInstruction::BranchOnCond: {
766     if (Part != 0)
767       break;
768 
769     Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
770     VPRegionBlock *ParentRegion = getParent()->getParent();
771     VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
772 
773     // Replace the temporary unreachable terminator with a new conditional
774     // branch, hooking it up to backward destination for exiting blocks now and
775     // to forward destination(s) later when they are created.
776     BranchInst *CondBr =
777         Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
778 
779     if (getParent()->isExiting())
780       CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
781 
782     CondBr->setSuccessor(0, nullptr);
783     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
784     break;
785   }
786   case VPInstruction::BranchOnCount: {
787     if (Part != 0)
788       break;
789     // First create the compare.
790     Value *IV = State.get(getOperand(0), Part);
791     Value *TC = State.get(getOperand(1), Part);
792     Value *Cond = Builder.CreateICmpEQ(IV, TC);
793 
794     // Now create the branch.
795     auto *Plan = getParent()->getPlan();
796     VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
797     VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
798 
799     // Replace the temporary unreachable terminator with a new conditional
800     // branch, hooking it up to backward destination (the header) now and to the
801     // forward destination (the exit/middle block) later when it is created.
802     // Note that CreateCondBr expects a valid BB as first argument, so we need
803     // to set it to nullptr later.
804     BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
805                                               State.CFG.VPBB2IRBB[Header]);
806     CondBr->setSuccessor(0, nullptr);
807     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
808     break;
809   }
810   default:
811     llvm_unreachable("Unsupported opcode for instruction");
812   }
813 }
814 
815 void VPInstruction::execute(VPTransformState &State) {
816   assert(!State.Instance && "VPInstruction executing an Instance");
817   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
818   State.Builder.setFastMathFlags(FMF);
819   for (unsigned Part = 0; Part < State.UF; ++Part)
820     generateInstruction(State, Part);
821 }
822 
823 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
824 void VPInstruction::dump() const {
825   VPSlotTracker SlotTracker(getParent()->getPlan());
826   print(dbgs(), "", SlotTracker);
827 }
828 
829 void VPInstruction::print(raw_ostream &O, const Twine &Indent,
830                           VPSlotTracker &SlotTracker) const {
831   O << Indent << "EMIT ";
832 
833   if (hasResult()) {
834     printAsOperand(O, SlotTracker);
835     O << " = ";
836   }
837 
838   switch (getOpcode()) {
839   case VPInstruction::Not:
840     O << "not";
841     break;
842   case VPInstruction::ICmpULE:
843     O << "icmp ule";
844     break;
845   case VPInstruction::SLPLoad:
846     O << "combined load";
847     break;
848   case VPInstruction::SLPStore:
849     O << "combined store";
850     break;
851   case VPInstruction::ActiveLaneMask:
852     O << "active lane mask";
853     break;
854   case VPInstruction::FirstOrderRecurrenceSplice:
855     O << "first-order splice";
856     break;
857   case VPInstruction::CanonicalIVIncrement:
858     O << "VF * UF + ";
859     break;
860   case VPInstruction::CanonicalIVIncrementNUW:
861     O << "VF * UF +(nuw) ";
862     break;
863   case VPInstruction::BranchOnCond:
864     O << "branch-on-cond";
865     break;
866   case VPInstruction::BranchOnCount:
867     O << "branch-on-count ";
868     break;
869   default:
870     O << Instruction::getOpcodeName(getOpcode());
871   }
872 
873   O << FMF;
874 
875   for (const VPValue *Operand : operands()) {
876     O << " ";
877     Operand->printAsOperand(O, SlotTracker);
878   }
879 
880   if (DL) {
881     O << ", !dbg ";
882     DL.print(O);
883   }
884 }
885 #endif
886 
887 void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) {
888   // Make sure the VPInstruction is a floating-point operation.
889   assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
890           Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
891           Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
892           Opcode == Instruction::FCmp) &&
893          "this op can't take fast-math flags");
894   FMF = FMFNew;
895 }
896 
897 void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
898                              Value *CanonicalIVStartValue,
899                              VPTransformState &State) {
900 
901   VPBasicBlock *ExitingVPBB = getVectorLoopRegion()->getExitingBasicBlock();
902   auto *Term = dyn_cast<VPInstruction>(&ExitingVPBB->back());
903   // Try to simplify BranchOnCount to 'BranchOnCond true' if TC <= VF * UF when
904   // preparing to execute the plan for the main vector loop.
905   if (!CanonicalIVStartValue && Term &&
906       Term->getOpcode() == VPInstruction::BranchOnCount &&
907       isa<ConstantInt>(TripCountV)) {
908     ConstantInt *C = cast<ConstantInt>(TripCountV);
909     uint64_t TCVal = C->getZExtValue();
910     if (TCVal && TCVal <= State.VF.getKnownMinValue() * State.UF) {
911       auto *BOC =
912           new VPInstruction(VPInstruction::BranchOnCond,
913                             {getOrAddExternalDef(State.Builder.getTrue())});
914       Term->eraseFromParent();
915       ExitingVPBB->appendRecipe(BOC);
916       // TODO: Further simplifications are possible
917       //      1. Replace inductions with constants.
918       //      2. Replace vector loop region with VPBasicBlock.
919     }
920   }
921 
922   // Check if the trip count is needed, and if so build it.
923   if (TripCount && TripCount->getNumUsers()) {
924     for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
925       State.set(TripCount, TripCountV, Part);
926   }
927 
928   // Check if the backedge taken count is needed, and if so build it.
929   if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
930     IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
931     auto *TCMO = Builder.CreateSub(TripCountV,
932                                    ConstantInt::get(TripCountV->getType(), 1),
933                                    "trip.count.minus.1");
934     auto VF = State.VF;
935     Value *VTCMO =
936         VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast");
937     for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
938       State.set(BackedgeTakenCount, VTCMO, Part);
939   }
940 
941   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
942     State.set(&VectorTripCount, VectorTripCountV, Part);
943 
944   // When vectorizing the epilogue loop, the canonical induction start value
945   // needs to be changed from zero to the value after the main vector loop.
946   if (CanonicalIVStartValue) {
947     VPValue *VPV = getOrAddExternalDef(CanonicalIVStartValue);
948     auto *IV = getCanonicalIV();
949     assert(all_of(IV->users(),
950                   [](const VPUser *U) {
951                     if (isa<VPScalarIVStepsRecipe>(U))
952                       return true;
953                     auto *VPI = cast<VPInstruction>(U);
954                     return VPI->getOpcode() ==
955                                VPInstruction::CanonicalIVIncrement ||
956                            VPI->getOpcode() ==
957                                VPInstruction::CanonicalIVIncrementNUW;
958                   }) &&
959            "the canonical IV should only be used by its increments or "
960            "ScalarIVSteps when "
961            "resetting the start value");
962     IV->setOperand(0, VPV);
963   }
964 }
965 
966 /// Generate the code inside the preheader and body of the vectorized loop.
967 /// Assumes a single pre-header basic-block was created for this. Introduce
968 /// additional basic-blocks as needed, and fill them all.
969 void VPlan::execute(VPTransformState *State) {
970   // Set the reverse mapping from VPValues to Values for code generation.
971   for (auto &Entry : Value2VPValue)
972     State->VPValue2Value[Entry.second] = Entry.first;
973 
974   // Initialize CFG state.
975   State->CFG.PrevVPBB = nullptr;
976   State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
977   BasicBlock *VectorPreHeader = State->CFG.PrevBB;
978   State->Builder.SetInsertPoint(VectorPreHeader->getTerminator());
979 
980   // Generate code in the loop pre-header and body.
981   for (VPBlockBase *Block : depth_first(Entry))
982     Block->execute(State);
983 
984   VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
985   BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
986 
987   // Fix the latch value of canonical, reduction and first-order recurrences
988   // phis in the vector loop.
989   VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
990   for (VPRecipeBase &R : Header->phis()) {
991     // Skip phi-like recipes that generate their backedege values themselves.
992     if (isa<VPWidenPHIRecipe>(&R))
993       continue;
994 
995     if (isa<VPWidenPointerInductionRecipe>(&R) ||
996         isa<VPWidenIntOrFpInductionRecipe>(&R)) {
997       PHINode *Phi = nullptr;
998       if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
999         Phi = cast<PHINode>(State->get(R.getVPSingleValue(), 0));
1000       } else {
1001         auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
1002         // TODO: Split off the case that all users of a pointer phi are scalar
1003         // from the VPWidenPointerInductionRecipe.
1004         if (WidenPhi->onlyScalarsGenerated(State->VF))
1005           continue;
1006 
1007         auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi, 0));
1008         Phi = cast<PHINode>(GEP->getPointerOperand());
1009       }
1010 
1011       Phi->setIncomingBlock(1, VectorLatchBB);
1012 
1013       // Move the last step to the end of the latch block. This ensures
1014       // consistent placement of all induction updates.
1015       Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
1016       Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
1017       continue;
1018     }
1019 
1020     auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
1021     // For  canonical IV, first-order recurrences and in-order reduction phis,
1022     // only a single part is generated, which provides the last part from the
1023     // previous iteration. For non-ordered reductions all UF parts are
1024     // generated.
1025     bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
1026                             isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) ||
1027                             cast<VPReductionPHIRecipe>(PhiR)->isOrdered();
1028     unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
1029 
1030     for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
1031       Value *Phi = State->get(PhiR, Part);
1032       Value *Val = State->get(PhiR->getBackedgeValue(),
1033                               SinglePartNeeded ? State->UF - 1 : Part);
1034       cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
1035     }
1036   }
1037 
1038   // We do not attempt to preserve DT for outer loop vectorization currently.
1039   if (!EnableVPlanNativePath) {
1040     BasicBlock *VectorHeaderBB = State->CFG.VPBB2IRBB[Header];
1041     State->DT->addNewBlock(VectorHeaderBB, VectorPreHeader);
1042     updateDominatorTree(State->DT, VectorHeaderBB, VectorLatchBB,
1043                         State->CFG.ExitBB);
1044   }
1045 }
1046 
1047 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1048 LLVM_DUMP_METHOD
1049 void VPlan::print(raw_ostream &O) const {
1050   VPSlotTracker SlotTracker(this);
1051 
1052   O << "VPlan '" << Name << "' {";
1053 
1054   if (VectorTripCount.getNumUsers() > 0) {
1055     O << "\nLive-in ";
1056     VectorTripCount.printAsOperand(O, SlotTracker);
1057     O << " = vector-trip-count\n";
1058   }
1059 
1060   if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
1061     O << "\nLive-in ";
1062     BackedgeTakenCount->printAsOperand(O, SlotTracker);
1063     O << " = backedge-taken count\n";
1064   }
1065 
1066   for (const VPBlockBase *Block : depth_first(getEntry())) {
1067     O << '\n';
1068     Block->print(O, "", SlotTracker);
1069   }
1070 
1071   if (!LiveOuts.empty())
1072     O << "\n";
1073   for (auto &KV : LiveOuts) {
1074     O << "Live-out ";
1075     KV.second->getPhi()->printAsOperand(O);
1076     O << " = ";
1077     KV.second->getOperand(0)->printAsOperand(O, SlotTracker);
1078     O << "\n";
1079   }
1080 
1081   O << "}\n";
1082 }
1083 
1084 LLVM_DUMP_METHOD
1085 void VPlan::printDOT(raw_ostream &O) const {
1086   VPlanPrinter Printer(O, *this);
1087   Printer.dump();
1088 }
1089 
1090 LLVM_DUMP_METHOD
1091 void VPlan::dump() const { print(dbgs()); }
1092 #endif
1093 
1094 void VPlan::addLiveOut(PHINode *PN, VPValue *V) {
1095   assert(LiveOuts.count(PN) == 0 && "an exit value for PN already exists");
1096   LiveOuts.insert({PN, new VPLiveOut(PN, V)});
1097 }
1098 
1099 void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB,
1100                                 BasicBlock *LoopLatchBB,
1101                                 BasicBlock *LoopExitBB) {
1102   // The vector body may be more than a single basic-block by this point.
1103   // Update the dominator tree information inside the vector body by propagating
1104   // it from header to latch, expecting only triangular control-flow, if any.
1105   BasicBlock *PostDomSucc = nullptr;
1106   for (auto *BB = LoopHeaderBB; BB != LoopLatchBB; BB = PostDomSucc) {
1107     // Get the list of successors of this block.
1108     std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB));
1109     assert(Succs.size() <= 2 &&
1110            "Basic block in vector loop has more than 2 successors.");
1111     PostDomSucc = Succs[0];
1112     if (Succs.size() == 1) {
1113       assert(PostDomSucc->getSinglePredecessor() &&
1114              "PostDom successor has more than one predecessor.");
1115       DT->addNewBlock(PostDomSucc, BB);
1116       continue;
1117     }
1118     BasicBlock *InterimSucc = Succs[1];
1119     if (PostDomSucc->getSingleSuccessor() == InterimSucc) {
1120       PostDomSucc = Succs[1];
1121       InterimSucc = Succs[0];
1122     }
1123     assert(InterimSucc->getSingleSuccessor() == PostDomSucc &&
1124            "One successor of a basic block does not lead to the other.");
1125     assert(InterimSucc->getSinglePredecessor() &&
1126            "Interim successor has more than one predecessor.");
1127     assert(PostDomSucc->hasNPredecessors(2) &&
1128            "PostDom successor has more than two predecessors.");
1129     DT->addNewBlock(InterimSucc, BB);
1130     DT->addNewBlock(PostDomSucc, BB);
1131   }
1132   // Latch block is a new dominator for the loop exit.
1133   DT->changeImmediateDominator(LoopExitBB, LoopLatchBB);
1134   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
1135 }
1136 
1137 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1138 Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
1139   return (isa<VPRegionBlock>(Block) ? "cluster_N" : "N") +
1140          Twine(getOrCreateBID(Block));
1141 }
1142 
1143 Twine VPlanPrinter::getOrCreateName(const VPBlockBase *Block) {
1144   const std::string &Name = Block->getName();
1145   if (!Name.empty())
1146     return Name;
1147   return "VPB" + Twine(getOrCreateBID(Block));
1148 }
1149 
1150 void VPlanPrinter::dump() {
1151   Depth = 1;
1152   bumpIndent(0);
1153   OS << "digraph VPlan {\n";
1154   OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
1155   if (!Plan.getName().empty())
1156     OS << "\\n" << DOT::EscapeString(Plan.getName());
1157   if (Plan.BackedgeTakenCount) {
1158     OS << ", where:\\n";
1159     Plan.BackedgeTakenCount->print(OS, SlotTracker);
1160     OS << " := BackedgeTakenCount";
1161   }
1162   OS << "\"]\n";
1163   OS << "node [shape=rect, fontname=Courier, fontsize=30]\n";
1164   OS << "edge [fontname=Courier, fontsize=30]\n";
1165   OS << "compound=true\n";
1166 
1167   for (const VPBlockBase *Block : depth_first(Plan.getEntry()))
1168     dumpBlock(Block);
1169 
1170   OS << "}\n";
1171 }
1172 
1173 void VPlanPrinter::dumpBlock(const VPBlockBase *Block) {
1174   if (const VPBasicBlock *BasicBlock = dyn_cast<VPBasicBlock>(Block))
1175     dumpBasicBlock(BasicBlock);
1176   else if (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
1177     dumpRegion(Region);
1178   else
1179     llvm_unreachable("Unsupported kind of VPBlock.");
1180 }
1181 
1182 void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To,
1183                             bool Hidden, const Twine &Label) {
1184   // Due to "dot" we print an edge between two regions as an edge between the
1185   // exiting basic block and the entry basic of the respective regions.
1186   const VPBlockBase *Tail = From->getExitingBasicBlock();
1187   const VPBlockBase *Head = To->getEntryBasicBlock();
1188   OS << Indent << getUID(Tail) << " -> " << getUID(Head);
1189   OS << " [ label=\"" << Label << '\"';
1190   if (Tail != From)
1191     OS << " ltail=" << getUID(From);
1192   if (Head != To)
1193     OS << " lhead=" << getUID(To);
1194   if (Hidden)
1195     OS << "; splines=none";
1196   OS << "]\n";
1197 }
1198 
1199 void VPlanPrinter::dumpEdges(const VPBlockBase *Block) {
1200   auto &Successors = Block->getSuccessors();
1201   if (Successors.size() == 1)
1202     drawEdge(Block, Successors.front(), false, "");
1203   else if (Successors.size() == 2) {
1204     drawEdge(Block, Successors.front(), false, "T");
1205     drawEdge(Block, Successors.back(), false, "F");
1206   } else {
1207     unsigned SuccessorNumber = 0;
1208     for (auto *Successor : Successors)
1209       drawEdge(Block, Successor, false, Twine(SuccessorNumber++));
1210   }
1211 }
1212 
1213 void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
1214   // Implement dot-formatted dump by performing plain-text dump into the
1215   // temporary storage followed by some post-processing.
1216   OS << Indent << getUID(BasicBlock) << " [label =\n";
1217   bumpIndent(1);
1218   std::string Str;
1219   raw_string_ostream SS(Str);
1220   // Use no indentation as we need to wrap the lines into quotes ourselves.
1221   BasicBlock->print(SS, "", SlotTracker);
1222 
1223   // We need to process each line of the output separately, so split
1224   // single-string plain-text dump.
1225   SmallVector<StringRef, 0> Lines;
1226   StringRef(Str).rtrim('\n').split(Lines, "\n");
1227 
1228   auto EmitLine = [&](StringRef Line, StringRef Suffix) {
1229     OS << Indent << '"' << DOT::EscapeString(Line.str()) << "\\l\"" << Suffix;
1230   };
1231 
1232   // Don't need the "+" after the last line.
1233   for (auto Line : make_range(Lines.begin(), Lines.end() - 1))
1234     EmitLine(Line, " +\n");
1235   EmitLine(Lines.back(), "\n");
1236 
1237   bumpIndent(-1);
1238   OS << Indent << "]\n";
1239 
1240   dumpEdges(BasicBlock);
1241 }
1242 
1243 void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) {
1244   OS << Indent << "subgraph " << getUID(Region) << " {\n";
1245   bumpIndent(1);
1246   OS << Indent << "fontname=Courier\n"
1247      << Indent << "label=\""
1248      << DOT::EscapeString(Region->isReplicator() ? "<xVFxUF> " : "<x1> ")
1249      << DOT::EscapeString(Region->getName()) << "\"\n";
1250   // Dump the blocks of the region.
1251   assert(Region->getEntry() && "Region contains no inner blocks.");
1252   for (const VPBlockBase *Block : depth_first(Region->getEntry()))
1253     dumpBlock(Block);
1254   bumpIndent(-1);
1255   OS << Indent << "}\n";
1256   dumpEdges(Region);
1257 }
1258 
1259 void VPlanIngredient::print(raw_ostream &O) const {
1260   if (auto *Inst = dyn_cast<Instruction>(V)) {
1261     if (!Inst->getType()->isVoidTy()) {
1262       Inst->printAsOperand(O, false);
1263       O << " = ";
1264     }
1265     O << Inst->getOpcodeName() << " ";
1266     unsigned E = Inst->getNumOperands();
1267     if (E > 0) {
1268       Inst->getOperand(0)->printAsOperand(O, false);
1269       for (unsigned I = 1; I < E; ++I)
1270         Inst->getOperand(I)->printAsOperand(O << ", ", false);
1271     }
1272   } else // !Inst
1273     V->printAsOperand(O, false);
1274 }
1275 
1276 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
1277                               VPSlotTracker &SlotTracker) const {
1278   O << Indent << "WIDEN-CALL ";
1279 
1280   auto *CI = cast<CallInst>(getUnderlyingInstr());
1281   if (CI->getType()->isVoidTy())
1282     O << "void ";
1283   else {
1284     printAsOperand(O, SlotTracker);
1285     O << " = ";
1286   }
1287 
1288   O << "call @" << CI->getCalledFunction()->getName() << "(";
1289   printOperands(O, SlotTracker);
1290   O << ")";
1291 }
1292 
1293 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
1294                                 VPSlotTracker &SlotTracker) const {
1295   O << Indent << "WIDEN-SELECT ";
1296   printAsOperand(O, SlotTracker);
1297   O << " = select ";
1298   getOperand(0)->printAsOperand(O, SlotTracker);
1299   O << ", ";
1300   getOperand(1)->printAsOperand(O, SlotTracker);
1301   O << ", ";
1302   getOperand(2)->printAsOperand(O, SlotTracker);
1303   O << (InvariantCond ? " (condition is loop invariant)" : "");
1304 }
1305 
1306 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
1307                           VPSlotTracker &SlotTracker) const {
1308   O << Indent << "WIDEN ";
1309   printAsOperand(O, SlotTracker);
1310   O << " = " << getUnderlyingInstr()->getOpcodeName() << " ";
1311   printOperands(O, SlotTracker);
1312 }
1313 
1314 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1315                                           VPSlotTracker &SlotTracker) const {
1316   O << Indent << "WIDEN-INDUCTION";
1317   if (getTruncInst()) {
1318     O << "\\l\"";
1319     O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
1320     O << " +\n" << Indent << "\"  ";
1321     getVPValue(0)->printAsOperand(O, SlotTracker);
1322   } else
1323     O << " " << VPlanIngredient(IV);
1324 
1325   O << ", ";
1326   getStepValue()->printAsOperand(O, SlotTracker);
1327 }
1328 
1329 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1330                                           VPSlotTracker &SlotTracker) const {
1331   O << Indent << "EMIT ";
1332   printAsOperand(O, SlotTracker);
1333   O << " = WIDEN-POINTER-INDUCTION ";
1334   getStartValue()->printAsOperand(O, SlotTracker);
1335   O << ", " << *IndDesc.getStep();
1336 }
1337 
1338 #endif
1339 
1340 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
1341   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
1342   auto *StepC = dyn_cast<SCEVConstant>(getInductionDescriptor().getStep());
1343   return StartC && StartC->isZero() && StepC && StepC->isOne();
1344 }
1345 
1346 VPCanonicalIVPHIRecipe *VPScalarIVStepsRecipe::getCanonicalIV() const {
1347   return cast<VPCanonicalIVPHIRecipe>(getOperand(0));
1348 }
1349 
1350 bool VPScalarIVStepsRecipe::isCanonical() const {
1351   auto *CanIV = getCanonicalIV();
1352   // The start value of the steps-recipe must match the start value of the
1353   // canonical induction and it must step by 1.
1354   if (CanIV->getStartValue() != getStartValue())
1355     return false;
1356   auto *StepVPV = getStepValue();
1357   if (StepVPV->getDef())
1358     return false;
1359   auto *StepC = dyn_cast_or_null<ConstantInt>(StepVPV->getLiveInIRValue());
1360   return StepC && StepC->isOne();
1361 }
1362 
1363 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1364 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
1365                                   VPSlotTracker &SlotTracker) const {
1366   O << Indent;
1367   printAsOperand(O, SlotTracker);
1368   O << Indent << "= SCALAR-STEPS ";
1369   printOperands(O, SlotTracker);
1370 }
1371 
1372 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
1373                              VPSlotTracker &SlotTracker) const {
1374   O << Indent << "WIDEN-GEP ";
1375   O << (IsPtrLoopInvariant ? "Inv" : "Var");
1376   size_t IndicesNumber = IsIndexLoopInvariant.size();
1377   for (size_t I = 0; I < IndicesNumber; ++I)
1378     O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]";
1379 
1380   O << " ";
1381   printAsOperand(O, SlotTracker);
1382   O << " = getelementptr ";
1383   printOperands(O, SlotTracker);
1384 }
1385 
1386 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1387                              VPSlotTracker &SlotTracker) const {
1388   O << Indent << "WIDEN-PHI ";
1389 
1390   auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
1391   // Unless all incoming values are modeled in VPlan  print the original PHI
1392   // directly.
1393   // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
1394   // values as VPValues.
1395   if (getNumOperands() != OriginalPhi->getNumOperands()) {
1396     O << VPlanIngredient(OriginalPhi);
1397     return;
1398   }
1399 
1400   printAsOperand(O, SlotTracker);
1401   O << " = phi ";
1402   printOperands(O, SlotTracker);
1403 }
1404 
1405 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
1406                           VPSlotTracker &SlotTracker) const {
1407   O << Indent << "BLEND ";
1408   Phi->printAsOperand(O, false);
1409   O << " =";
1410   if (getNumIncomingValues() == 1) {
1411     // Not a User of any mask: not really blending, this is a
1412     // single-predecessor phi.
1413     O << " ";
1414     getIncomingValue(0)->printAsOperand(O, SlotTracker);
1415   } else {
1416     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
1417       O << " ";
1418       getIncomingValue(I)->printAsOperand(O, SlotTracker);
1419       O << "/";
1420       getMask(I)->printAsOperand(O, SlotTracker);
1421     }
1422   }
1423 }
1424 
1425 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
1426                               VPSlotTracker &SlotTracker) const {
1427   O << Indent << "REDUCE ";
1428   printAsOperand(O, SlotTracker);
1429   O << " = ";
1430   getChainOp()->printAsOperand(O, SlotTracker);
1431   O << " +";
1432   if (isa<FPMathOperator>(getUnderlyingInstr()))
1433     O << getUnderlyingInstr()->getFastMathFlags();
1434   O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " (";
1435   getVecOp()->printAsOperand(O, SlotTracker);
1436   if (getCondOp()) {
1437     O << ", ";
1438     getCondOp()->printAsOperand(O, SlotTracker);
1439   }
1440   O << ")";
1441   if (RdxDesc->IntermediateStore)
1442     O << " (with final reduction value stored in invariant address sank "
1443          "outside of loop)";
1444 }
1445 
1446 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
1447                               VPSlotTracker &SlotTracker) const {
1448   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
1449 
1450   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
1451     printAsOperand(O, SlotTracker);
1452     O << " = ";
1453   }
1454   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
1455     O << "call @" << CB->getCalledFunction()->getName() << "(";
1456     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
1457                     O, [&O, &SlotTracker](VPValue *Op) {
1458                       Op->printAsOperand(O, SlotTracker);
1459                     });
1460     O << ")";
1461   } else {
1462     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " ";
1463     printOperands(O, SlotTracker);
1464   }
1465 
1466   if (AlsoPack)
1467     O << " (S->V)";
1468 }
1469 
1470 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1471                                 VPSlotTracker &SlotTracker) const {
1472   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
1473   printAsOperand(O, SlotTracker);
1474   O << " = ";
1475   printOperands(O, SlotTracker);
1476 }
1477 
1478 void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
1479                                            VPSlotTracker &SlotTracker) const {
1480   O << Indent << "WIDEN ";
1481 
1482   if (!isStore()) {
1483     getVPSingleValue()->printAsOperand(O, SlotTracker);
1484     O << " = ";
1485   }
1486   O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";
1487 
1488   printOperands(O, SlotTracker);
1489 }
1490 #endif
1491 
1492 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
1493   Value *Start = getStartValue()->getLiveInIRValue();
1494   PHINode *EntryPart = PHINode::Create(
1495       Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt());
1496 
1497   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1498   EntryPart->addIncoming(Start, VectorPH);
1499   EntryPart->setDebugLoc(DL);
1500   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1501     State.set(this, EntryPart, Part);
1502 }
1503 
1504 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1505 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1506                                    VPSlotTracker &SlotTracker) const {
1507   O << Indent << "EMIT ";
1508   printAsOperand(O, SlotTracker);
1509   O << " = CANONICAL-INDUCTION";
1510 }
1511 #endif
1512 
1513 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) {
1514   bool IsUniform = vputils::onlyFirstLaneUsed(this);
1515   return all_of(users(),
1516                 [&](const VPUser *U) { return U->usesScalars(this); }) &&
1517          (IsUniform || !VF.isScalable());
1518 }
1519 
1520 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
1521   assert(!State.Instance && "cannot be used in per-lane");
1522   const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
1523   SCEVExpander Exp(SE, DL, "induction");
1524 
1525   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
1526                                  &*State.Builder.GetInsertPoint());
1527 
1528   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1529     State.set(this, Res, Part);
1530 }
1531 
1532 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1533 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
1534                                VPSlotTracker &SlotTracker) const {
1535   O << Indent << "EMIT ";
1536   getVPSingleValue()->printAsOperand(O, SlotTracker);
1537   O << " = EXPAND SCEV " << *Expr;
1538 }
1539 #endif
1540 
1541 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
1542   Value *CanonicalIV = State.get(getOperand(0), 0);
1543   Type *STy = CanonicalIV->getType();
1544   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
1545   ElementCount VF = State.VF;
1546   Value *VStart = VF.isScalar()
1547                       ? CanonicalIV
1548                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
1549   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
1550     Value *VStep = createStepForVF(Builder, STy, VF, Part);
1551     if (VF.isVector()) {
1552       VStep = Builder.CreateVectorSplat(VF, VStep);
1553       VStep = Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
1554     }
1555     Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
1556     State.set(this, CanonicalVectorIV, Part);
1557   }
1558 }
1559 
1560 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1561 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
1562                                      VPSlotTracker &SlotTracker) const {
1563   O << Indent << "EMIT ";
1564   printAsOperand(O, SlotTracker);
1565   O << " = WIDEN-CANONICAL-INDUCTION ";
1566   printOperands(O, SlotTracker);
1567 }
1568 #endif
1569 
1570 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
1571   auto &Builder = State.Builder;
1572   // Create a vector from the initial value.
1573   auto *VectorInit = getStartValue()->getLiveInIRValue();
1574 
1575   Type *VecTy = State.VF.isScalar()
1576                     ? VectorInit->getType()
1577                     : VectorType::get(VectorInit->getType(), State.VF);
1578 
1579   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1580   if (State.VF.isVector()) {
1581     auto *IdxTy = Builder.getInt32Ty();
1582     auto *One = ConstantInt::get(IdxTy, 1);
1583     IRBuilder<>::InsertPointGuard Guard(Builder);
1584     Builder.SetInsertPoint(VectorPH->getTerminator());
1585     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
1586     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
1587     VectorInit = Builder.CreateInsertElement(
1588         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
1589   }
1590 
1591   // Create a phi node for the new recurrence.
1592   PHINode *EntryPart = PHINode::Create(
1593       VecTy, 2, "vector.recur", &*State.CFG.PrevBB->getFirstInsertionPt());
1594   EntryPart->addIncoming(VectorInit, VectorPH);
1595   State.set(this, EntryPart, 0);
1596 }
1597 
1598 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1599 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
1600                                             VPSlotTracker &SlotTracker) const {
1601   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
1602   printAsOperand(O, SlotTracker);
1603   O << " = phi ";
1604   printOperands(O, SlotTracker);
1605 }
1606 #endif
1607 
1608 void VPReductionPHIRecipe::execute(VPTransformState &State) {
1609   PHINode *PN = cast<PHINode>(getUnderlyingValue());
1610   auto &Builder = State.Builder;
1611 
1612   // In order to support recurrences we need to be able to vectorize Phi nodes.
1613   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
1614   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
1615   // this value when we vectorize all of the instructions that use the PHI.
1616   bool ScalarPHI = State.VF.isScalar() || IsInLoop;
1617   Type *VecTy =
1618       ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
1619 
1620   BasicBlock *HeaderBB = State.CFG.PrevBB;
1621   assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
1622          "recipe must be in the vector loop header");
1623   unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
1624   for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
1625     Value *EntryPart =
1626         PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt());
1627     State.set(this, EntryPart, Part);
1628   }
1629 
1630   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1631 
1632   // Reductions do not have to start at zero. They can start with
1633   // any loop invariant values.
1634   VPValue *StartVPV = getStartValue();
1635   Value *StartV = StartVPV->getLiveInIRValue();
1636 
1637   Value *Iden = nullptr;
1638   RecurKind RK = RdxDesc.getRecurrenceKind();
1639   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
1640       RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) {
1641     // MinMax reduction have the start value as their identify.
1642     if (ScalarPHI) {
1643       Iden = StartV;
1644     } else {
1645       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
1646       Builder.SetInsertPoint(VectorPH->getTerminator());
1647       StartV = Iden =
1648           Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
1649     }
1650   } else {
1651     Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
1652                                          RdxDesc.getFastMathFlags());
1653 
1654     if (!ScalarPHI) {
1655       Iden = Builder.CreateVectorSplat(State.VF, Iden);
1656       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
1657       Builder.SetInsertPoint(VectorPH->getTerminator());
1658       Constant *Zero = Builder.getInt32(0);
1659       StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
1660     }
1661   }
1662 
1663   for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
1664     Value *EntryPart = State.get(this, Part);
1665     // Make sure to add the reduction start value only to the
1666     // first unroll part.
1667     Value *StartVal = (Part == 0) ? StartV : Iden;
1668     cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
1669   }
1670 }
1671 
1672 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1673 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1674                                  VPSlotTracker &SlotTracker) const {
1675   O << Indent << "WIDEN-REDUCTION-PHI ";
1676 
1677   printAsOperand(O, SlotTracker);
1678   O << " = phi ";
1679   printOperands(O, SlotTracker);
1680 }
1681 #endif
1682 
1683 void VPWidenPHIRecipe::execute(VPTransformState &State) {
1684   assert(EnableVPlanNativePath &&
1685          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
1686 
1687   // Currently we enter here in the VPlan-native path for non-induction
1688   // PHIs where all control flow is uniform. We simply widen these PHIs.
1689   // Create a vector phi with no operands - the vector phi operands will be
1690   // set at the end of vector code generation.
1691   VPBasicBlock *Parent = getParent();
1692   VPRegionBlock *LoopRegion = Parent->getEnclosingLoopRegion();
1693   unsigned StartIdx = 0;
1694   // For phis in header blocks of loop regions, use the index of the value
1695   // coming from the preheader.
1696   if (LoopRegion->getEntryBasicBlock() == Parent) {
1697     for (unsigned I = 0; I < getNumOperands(); ++I) {
1698       if (getIncomingBlock(I) ==
1699           LoopRegion->getSinglePredecessor()->getExitingBasicBlock())
1700         StartIdx = I;
1701     }
1702   }
1703   Value *Op0 = State.get(getOperand(StartIdx), 0);
1704   Type *VecTy = Op0->getType();
1705   Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
1706   State.set(this, VecPhi, 0);
1707 }
1708 
1709 template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
1710 
1711 void VPValue::replaceAllUsesWith(VPValue *New) {
1712   for (unsigned J = 0; J < getNumUsers();) {
1713     VPUser *User = Users[J];
1714     unsigned NumUsers = getNumUsers();
1715     for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I)
1716       if (User->getOperand(I) == this)
1717         User->setOperand(I, New);
1718     // If a user got removed after updating the current user, the next user to
1719     // update will be moved to the current position, so we only need to
1720     // increment the index if the number of users did not change.
1721     if (NumUsers == getNumUsers())
1722       J++;
1723   }
1724 }
1725 
1726 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1727 void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const {
1728   if (const Value *UV = getUnderlyingValue()) {
1729     OS << "ir<";
1730     UV->printAsOperand(OS, false);
1731     OS << ">";
1732     return;
1733   }
1734 
1735   unsigned Slot = Tracker.getSlot(this);
1736   if (Slot == unsigned(-1))
1737     OS << "<badref>";
1738   else
1739     OS << "vp<%" << Tracker.getSlot(this) << ">";
1740 }
1741 
1742 void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const {
1743   interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
1744     Op->printAsOperand(O, SlotTracker);
1745   });
1746 }
1747 #endif
1748 
1749 void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
1750                                           Old2NewTy &Old2New,
1751                                           InterleavedAccessInfo &IAI) {
1752   ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry());
1753   for (VPBlockBase *Base : RPOT) {
1754     visitBlock(Base, Old2New, IAI);
1755   }
1756 }
1757 
1758 void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
1759                                          InterleavedAccessInfo &IAI) {
1760   if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) {
1761     for (VPRecipeBase &VPI : *VPBB) {
1762       if (isa<VPHeaderPHIRecipe>(&VPI))
1763         continue;
1764       assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions");
1765       auto *VPInst = cast<VPInstruction>(&VPI);
1766 
1767       auto *Inst = dyn_cast_or_null<Instruction>(VPInst->getUnderlyingValue());
1768       if (!Inst)
1769         continue;
1770       auto *IG = IAI.getInterleaveGroup(Inst);
1771       if (!IG)
1772         continue;
1773 
1774       auto NewIGIter = Old2New.find(IG);
1775       if (NewIGIter == Old2New.end())
1776         Old2New[IG] = new InterleaveGroup<VPInstruction>(
1777             IG->getFactor(), IG->isReverse(), IG->getAlign());
1778 
1779       if (Inst == IG->getInsertPos())
1780         Old2New[IG]->setInsertPos(VPInst);
1781 
1782       InterleaveGroupMap[VPInst] = Old2New[IG];
1783       InterleaveGroupMap[VPInst]->insertMember(
1784           VPInst, IG->getIndex(Inst),
1785           Align(IG->isReverse() ? (-1) * int(IG->getFactor())
1786                                 : IG->getFactor()));
1787     }
1788   } else if (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
1789     visitRegion(Region, Old2New, IAI);
1790   else
1791     llvm_unreachable("Unsupported kind of VPBlock.");
1792 }
1793 
1794 VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
1795                                                  InterleavedAccessInfo &IAI) {
1796   Old2NewTy Old2New;
1797   visitRegion(Plan.getVectorLoopRegion(), Old2New, IAI);
1798 }
1799 
1800 void VPSlotTracker::assignSlot(const VPValue *V) {
1801   assert(Slots.find(V) == Slots.end() && "VPValue already has a slot!");
1802   Slots[V] = NextSlot++;
1803 }
1804 
1805 void VPSlotTracker::assignSlots(const VPlan &Plan) {
1806 
1807   for (const auto &P : Plan.VPExternalDefs)
1808     assignSlot(P.second);
1809 
1810   assignSlot(&Plan.VectorTripCount);
1811   if (Plan.BackedgeTakenCount)
1812     assignSlot(Plan.BackedgeTakenCount);
1813 
1814   ReversePostOrderTraversal<
1815       VPBlockRecursiveTraversalWrapper<const VPBlockBase *>>
1816       RPOT(VPBlockRecursiveTraversalWrapper<const VPBlockBase *>(
1817           Plan.getEntry()));
1818   for (const VPBasicBlock *VPBB :
1819        VPBlockUtils::blocksOnly<const VPBasicBlock>(RPOT))
1820     for (const VPRecipeBase &Recipe : *VPBB)
1821       for (VPValue *Def : Recipe.definedValues())
1822         assignSlot(Def);
1823 }
1824 
1825 bool vputils::onlyFirstLaneUsed(VPValue *Def) {
1826   return all_of(Def->users(),
1827                 [Def](VPUser *U) { return U->onlyFirstLaneUsed(Def); });
1828 }
1829 
1830 VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
1831                                                 ScalarEvolution &SE) {
1832   if (auto *E = dyn_cast<SCEVConstant>(Expr))
1833     return Plan.getOrAddExternalDef(E->getValue());
1834   if (auto *E = dyn_cast<SCEVUnknown>(Expr))
1835     return Plan.getOrAddExternalDef(E->getValue());
1836 
1837   VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock();
1838   VPValue *Step = new VPExpandSCEVRecipe(Expr, SE);
1839   Preheader->appendRecipe(cast<VPRecipeBase>(Step->getDef()));
1840   return Step;
1841 }
1842