//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements a set of utility VPlan to VPlan transformations.
///
//===----------------------------------------------------------------------===//

#include "VPlanTransforms.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "VPlanPatternMatch.h"
#include "VPlanUtils.h"
#include "VPlanVerifier.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;

void VPlanTransforms::VPInstructionsToVPRecipes(
    VPlanPtr &Plan,
    function_ref<const InductionDescriptor *(PHINode *)>
        GetIntOrFpInductionDescriptor,
    ScalarEvolution &SE, const TargetLibraryInfo &TLI) {

  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan->getVectorLoopRegion());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    // Skip blocks outside region.
    if (!VPBB->getParent())
      break;
    VPRecipeBase *Term = VPBB->getTerminator();
    auto EndIter = Term ? Term->getIterator() : VPBB->end();
    // Introduce each ingredient into VPlan.
    for (VPRecipeBase &Ingredient :
         make_early_inc_range(make_range(VPBB->begin(), EndIter))) {

      VPValue *VPV = Ingredient.getVPSingleValue();
      Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());

      VPRecipeBase *NewRecipe = nullptr;
      if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) {
        auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
        const auto *II = GetIntOrFpInductionDescriptor(Phi);
        if (!II)
          continue;

        VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue());
        VPValue *Step =
            vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE);
        NewRecipe = new VPWidenIntOrFpInductionRecipe(
            Phi, Start, Step, &Plan->getVF(), *II, Ingredient.getDebugLoc());
      } else {
        assert(isa<VPInstruction>(&Ingredient) &&
               "only VPInstructions expected here");
        assert(!isa<PHINode>(Inst) && "phis should be handled above");
        // Create VPWidenMemoryRecipe for loads and stores.
        if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
          NewRecipe = new VPWidenLoadRecipe(
              *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
              false /*Consecutive*/, false /*Reverse*/,
              Ingredient.getDebugLoc());
        } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
          NewRecipe = new VPWidenStoreRecipe(
              *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
              nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
              Ingredient.getDebugLoc());
        } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
          NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
        } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
          NewRecipe = new VPWidenIntrinsicRecipe(
              *CI, getVectorIntrinsicIDForCall(CI, &TLI),
              {Ingredient.op_begin(), Ingredient.op_end() - 1}, CI->getType(),
              CI->getDebugLoc());
        } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
          NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
        } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
          NewRecipe = new VPWidenCastRecipe(
              CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
        } else {
          NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
        }
      }

      NewRecipe->insertBefore(&Ingredient);
      if (NewRecipe->getNumDefinedValues() == 1)
        VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
      else
        assert(NewRecipe->getNumDefinedValues() == 0 &&
               "Only recipes with zero or one defined values expected");
      Ingredient.eraseFromParent();
    }
  }
}

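/// Sink recipes defining operands used inside replicate blocks into those
/// blocks, so their results are only computed when the predicated block
/// executes. Returns true if any recipe was sunk or duplicated.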
static bool sinkScalarOperands(VPlan &Plan) {
  auto Iter = vp_depth_first_deep(Plan.getEntry());
  bool Changed = false;
  // First, collect the operands of all recipes in replicate blocks as seeds for
  // sinking.
  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
  for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
    VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
    if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
      continue;
    VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);
    if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
      continue;
    for (auto &Recipe : *VPBB) {
      for (VPValue *Op : Recipe.operands())
        if (auto *Def =
                dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
          WorkList.insert(std::make_pair(VPBB, Def));
    }
  }

  bool ScalarVFOnly = Plan.hasScalarVFOnly();
  // Try to sink each replicate or scalar IV steps recipe in the worklist.
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    VPBasicBlock *SinkTo;
    VPSingleDefRecipe *SinkCandidate;
    std::tie(SinkTo, SinkCandidate) = WorkList[I];
    if (SinkCandidate->getParent() == SinkTo ||
        SinkCandidate->mayHaveSideEffects() ||
        SinkCandidate->mayReadOrWriteMemory())
      continue;
    if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
      if (!ScalarVFOnly && RepR->isUniform())
        continue;
    } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
      continue;

    bool NeedsDuplicating = false;
    // All recipe users of the sink candidate must be in the same block SinkTo
    // or all users outside of SinkTo must be uniform-after-vectorization (i.e.,
    // only the first lane is used). In the latter case, we need to duplicate
    // SinkCandidate.
    auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
                            SinkCandidate](VPUser *U) {
      auto *UI = cast<VPRecipeBase>(U);
      if (UI->getParent() == SinkTo)
        return true;
      NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);
      // We only know how to duplicate VPReplicateRecipes for now.
      return NeedsDuplicating && isa<VPReplicateRecipe>(SinkCandidate);
    };
    if (!all_of(SinkCandidate->users(), CanSinkWithUser))
      continue;

    if (NeedsDuplicating) {
      if (ScalarVFOnly)
        continue;
      Instruction *I = SinkCandidate->getUnderlyingInstr();
      auto *Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true);
      // TODO: add ".cloned" suffix to name of Clone's VPValue.

      Clone->insertBefore(SinkCandidate);
      SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
      });
    }
    SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
    for (VPValue *Op : SinkCandidate->operands())
      if (auto *Def =
              dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
        WorkList.insert(std::make_pair(SinkTo, Def));
    Changed = true;
  }
  return Changed;
}

/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
/// the mask.
VPValue *getPredicatedMask(VPRegionBlock *R) {
  auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
  if (!EntryBB || EntryBB->size() != 1 ||
      !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
    return nullptr;

  return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
}

/// If \p R is a triangle region, return the 'then' block of the triangle.
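/// Illustrative sketch of the shape being matched, where Entry has two
/// successors and the 'then' block falls through to the other successor:
///
///        Entry
///        /   \
///    Then     |
///        \   /
///        Succ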
static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
  auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
  if (EntryBB->getNumSuccessors() != 2)
    return nullptr;

  auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
  auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
  if (!Succ0 || !Succ1)
    return nullptr;

  if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
    return nullptr;
  if (Succ0->getSingleSuccessor() == Succ1)
    return Succ0;
  if (Succ1->getSingleSuccessor() == Succ0)
    return Succ1;
  return nullptr;
}

// Merge replicate regions into their successor region, if a replicate region
// is connected to a successor replicate region with the same predicate by a
// single, empty VPBasicBlock.
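// Illustrative sketch of the input and result:
//
//   [Region1: then1, merge1] -> empty VPBB -> [Region2: then2, merge2]
//
// becomes Region2 alone, with then1's recipes prepended to then2 and merge1's
// VPPredInstPHIRecipes moved into merge2.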
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
  SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;

  // Collect replicate regions followed by an empty block, followed by another
  // replicate region with matching masks, to process up front. This is to
  // avoid iterator invalidation issues while merging regions.
  SmallVector<VPRegionBlock *, 8> WorkList;
  for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    if (!Region1->isReplicator())
      continue;
    auto *MiddleBasicBlock =
        dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
    if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
      continue;

    auto *Region2 =
        dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
    if (!Region2 || !Region2->isReplicator())
      continue;

    VPValue *Mask1 = getPredicatedMask(Region1);
    VPValue *Mask2 = getPredicatedMask(Region2);
    if (!Mask1 || Mask1 != Mask2)
      continue;

    assert(Mask1 && Mask2 && "both regions must have conditions");
    WorkList.push_back(Region1);
  }

  // Move recipes from Region1 to its successor region, if both are triangles.
  for (VPRegionBlock *Region1 : WorkList) {
    if (TransformedRegions.contains(Region1))
      continue;
    auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
    auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());

    VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
    VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
    if (!Then1 || !Then2)
      continue;

    // Note: No fusion-preventing memory dependencies are expected in either
    // region. Such dependencies should be rejected during earlier dependence
    // checks, which guarantee accesses can be re-ordered for vectorization.
    //
    // Move recipes to the successor region.
    for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
      ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());

    auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
    auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());

    // Move VPPredInstPHIRecipes from the merge block to the successor region's
    // merge block. Update all users inside the successor region to use the
    // original values.
    for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
      VPValue *PredInst1 =
          cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
      VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
      Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() == Then2;
      });

      // Remove phi recipes that are unused after merging the regions.
      if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
        Phi1ToMove.eraseFromParent();
        continue;
      }
      Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
    }

    // Finally, remove the first region.
    for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
      VPBlockUtils::disconnectBlocks(Pred, Region1);
      VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
    }
    VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
    TransformedRegions.insert(Region1);
  }

  return !TransformedRegions.empty();
}

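/// Create a new replicate region for \p PredRecipe: a triangle with an entry
/// block branching on the recipe's mask, an "if" block holding an unmasked
/// clone of the recipe, and a "continue" block with an optional
/// VPPredInstPHIRecipe merging the result.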
static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
                                            VPlan &Plan) {
  Instruction *Instr = PredRecipe->getUnderlyingInstr();
  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BlockInMask = PredRecipe->getMask();
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry =
      Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);

  // Replace predicated replicate recipe with a replicate recipe without a
  // mask but in the replicate region.
  auto *RecipeWithoutMask = new VPReplicateRecipe(
      PredRecipe->getUnderlyingInstr(),
      make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
      PredRecipe->isUniform());
  auto *Pred =
      Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);

  VPPredInstPHIRecipe *PHIRecipe = nullptr;
  if (PredRecipe->getNumUsers() != 0) {
    PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
                                        RecipeWithoutMask->getDebugLoc());
    PredRecipe->replaceAllUsesWith(PHIRecipe);
    PHIRecipe->setOperand(0, RecipeWithoutMask);
  }
  PredRecipe->eraseFromParent();
  auto *Exiting =
      Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  VPRegionBlock *Region =
      Plan.createVPRegionBlock(Entry, Exiting, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
  VPBlockUtils::connectBlocks(Pred, Exiting);

  return Region;
}

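/// Wrap each predicated VPReplicateRecipe in an if-then replicate region,
/// splitting its block at the recipe and inserting the new region on the
/// resulting edge.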
static void addReplicateRegions(VPlan &Plan) {
  SmallVector<VPReplicateRecipe *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    for (VPRecipeBase &R : *VPBB)
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isPredicated())
          WorkList.push_back(RepR);
      }
  }

  unsigned BBNum = 0;
  for (VPReplicateRecipe *RepR : WorkList) {
    VPBasicBlock *CurrentBlock = RepR->getParent();
    VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());

    BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
    SplitBlock->setName(
        OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
    // Record predicated instructions for above packing optimizations.
    VPBlockBase *Region = createReplicateRegion(RepR, Plan);
    Region->setParent(CurrentBlock->getParent());
    VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);
  }
}

/// Remove redundant VPBasicBlocks by merging them into their predecessor if
/// the predecessor has a single successor.
static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
  SmallVector<VPBasicBlock *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    // Don't fold the blocks in the skeleton of the Plan into their single
    // predecessors for now.
    // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
    if (!VPBB->getParent())
      continue;
    auto *PredVPBB =
        dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
    if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
        isa<VPIRBasicBlock>(PredVPBB))
      continue;
    WorkList.push_back(VPBB);
  }

  for (VPBasicBlock *VPBB : WorkList) {
    VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
      R.moveBefore(*PredVPBB, PredVPBB->end());
    VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
    auto *ParentRegion = cast_or_null<VPRegionBlock>(VPBB->getParent());
    if (ParentRegion && ParentRegion->getExiting() == VPBB)
      ParentRegion->setExiting(PredVPBB);
    for (auto *Succ : to_vector(VPBB->successors())) {
      VPBlockUtils::disconnectBlocks(VPBB, Succ);
      VPBlockUtils::connectBlocks(PredVPBB, Succ);
    }
    // VPBB is now dead and will be cleaned up when the plan gets destroyed.
  }
  return !WorkList.empty();
}

void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
  // Convert masked VPReplicateRecipes to if-then region blocks.
  addReplicateRegions(Plan);

  bool ShouldSimplify = true;
  while (ShouldSimplify) {
    ShouldSimplify = sinkScalarOperands(Plan);
    ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
    ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
  }
}

/// Remove redundant casts of inductions.
///
/// Such redundant casts are casts of induction variables that can be ignored,
/// because we already proved that the casted phi is equal to the uncasted phi
/// in the vectorized loop. There is no need to vectorize the cast - the same
/// value can be used for both the phi and casts in the vector loop.
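/// For example (illustrative), with a recorded cast chain {%c = sext %iv},
/// the recipe for %c is found via the IV's users and all of its uses are
/// redirected to the widened IV, leaving the cast recipe dead.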
static void removeRedundantInductionCasts(VPlan &Plan) {
  for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (!IV || IV->getTruncInst())
      continue;

    // A sequence of IR Casts has potentially been recorded for IV, which
    // *must be bypassed* when the IV is vectorized, because the vectorized IV
    // will produce the desired casted value. This sequence forms a def-use
    // chain and is provided in reverse order, ending with the cast that uses
    // the IV phi. Search for the recipe of the last cast in the chain and
    // replace it with the original IV. Note that only the final cast is
    // expected to have users outside the cast-chain and the dead casts left
    // over will be cleaned up later.
    auto &Casts = IV->getInductionDescriptor().getCastInsts();
    VPValue *FindMyCast = IV;
    for (Instruction *IRCast : reverse(Casts)) {
      VPSingleDefRecipe *FoundUserCast = nullptr;
      for (auto *U : FindMyCast->users()) {
        auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
        if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
          FoundUserCast = UserCast;
          break;
        }
      }
      FindMyCast = FoundUserCast;
    }
    FindMyCast->replaceAllUsesWith(IV);
  }
}

/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
/// recipe, if it exists.
static void removeRedundantCanonicalIVs(VPlan &Plan) {
  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
  for (VPUser *U : CanonicalIV->users()) {
    WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
    if (WidenNewIV)
      break;
  }

  if (!WidenNewIV)
    return;

  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);

    if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
      continue;

    // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
    // everything WidenNewIV's users need. That is, WidenOriginalIV will
    // generate a vector phi or all users of WidenNewIV demand the first lane
    // only.
    if (any_of(WidenOriginalIV->users(),
               [WidenOriginalIV](VPUser *U) {
                 return !U->usesScalars(WidenOriginalIV);
               }) ||
        vputils::onlyFirstLaneUsed(WidenNewIV)) {
      WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
      WidenNewIV->eraseFromParent();
      return;
    }
  }
}

/// Returns true if \p R is dead and can be removed.
static bool isDeadRecipe(VPRecipeBase &R) {
  using namespace llvm::PatternMatch;
  // Do remove conditional assume instructions as their conditions may be
  // flattened.
  auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
  bool IsConditionalAssume =
      RepR && RepR->isPredicated() &&
      match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>());
  if (IsConditionalAssume)
    return true;

  if (R.mayHaveSideEffects())
    return false;

  // Recipe is dead if no user keeps the recipe alive.
  return all_of(R.definedValues(),
                [](VPValue *V) { return V->getNumUsers() == 0; });
}

void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());

  for (VPBasicBlock *VPBB :
       reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
    // The recipes in the block are processed in reverse order, to catch chains
    // of dead recipes.
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      if (isDeadRecipe(R))
        R.eraseFromParent();
    }
  }
}

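/// Create a recipe computing the scalar steps of an induction with kind
/// \p Kind and opcode \p InductionOpcode, starting at \p StartV and advancing
/// by \p Step, based on a derived IV built on the canonical IV. The base IV is
/// truncated to \p TruncI's type, if provided, and the step is truncated to
/// match the result type, if needed.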
static VPScalarIVStepsRecipe *
createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
                    Instruction::BinaryOps InductionOpcode,
                    FPMathOperator *FPBinOp, Instruction *TruncI,
                    VPValue *StartV, VPValue *Step, DebugLoc DL,
                    VPBuilder &Builder) {
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
      Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");

  // Truncate base induction if needed.
  Type *CanonicalIVType = CanonicalIV->getScalarType();
  VPTypeAnalysis TypeInfo(CanonicalIVType);
  Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
  if (TruncI) {
    Type *TruncTy = TruncI->getType();
    assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
    BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
    ResultTy = TruncTy;
  }

  // Truncate step if needed.
  Type *StepTy = TypeInfo.inferScalarType(Step);
  if (ResultTy != StepTy) {
    assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
    auto *VecPreheader =
        cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor());
    VPBuilder::InsertPointGuard Guard(Builder);
    Builder.setInsertPoint(VecPreheader);
    Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
  }
  return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step);
}

static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
  SetVector<VPUser *> Users(V->user_begin(), V->user_end());
  for (unsigned I = 0; I != Users.size(); ++I) {
    VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]);
    if (isa<VPHeaderPHIRecipe>(Cur))
      continue;
    for (VPValue *V : Cur->definedValues())
      Users.insert(V->user_begin(), V->user_end());
  }
  return Users.takeVector();
}

/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
/// VPWidenPointerInductionRecipe will generate vectors only. If some users
/// require vectors while others require scalars, the scalar uses need to
/// extract the scalars from the generated vectors (note that this is different
/// from how int/fp inductions are handled). Legalize extract-from-ends using
/// uniform VPReplicateRecipe of wide inductions to use regular
/// VPReplicateRecipe, so the correct end value is available. Also optimize
/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
/// providing them scalar steps built on the canonical scalar IV and update the
/// original IV's users. This is an optional optimization to reduce the need
/// for vector extracts.
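/// For example (illustrative VPlan snippet), a pointer induction used only for
/// scalar addresses is roughly rewritten as:
///
///   vp<%steps>    = SCALAR-STEPS ir<0>, ir<%step>
///   vp<%next.gep> = ptradd ir<%start>, vp<%steps>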
static void legalizeAndOptimizeInductions(VPlan &Plan) {
  using namespace llvm::VPlanPatternMatch;
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  bool HasOnlyVectorVFs = !Plan.hasVF(ElementCount::getFixed(1));
  VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
    if (!PhiR)
      continue;

    // Try to narrow wide and replicating recipes to uniform recipes, based on
    // VPlan analysis.
    // TODO: Apply to all recipes in the future, to replace legacy uniformity
    // analysis.
    auto Users = collectUsersRecursively(PhiR);
    for (VPUser *U : reverse(Users)) {
      auto *Def = dyn_cast<VPSingleDefRecipe>(U);
      auto *RepR = dyn_cast<VPReplicateRecipe>(U);
      // Skip recipes that shouldn't be narrowed.
      if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
          Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
          (RepR && (RepR->isUniform() || RepR->isPredicated())))
        continue;

      // Skip recipes that may use lanes other than the first.
      if (!vputils::isUniformAfterVectorization(Def) &&
          !vputils::onlyFirstLaneUsed(Def))
        continue;

      auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
                                          Def->operands(), /*IsUniform*/ true);
      Clone->insertAfter(Def);
      Def->replaceAllUsesWith(Clone);
    }

    // Replace wide pointer inductions which have only their scalars used by
    // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
    if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
      if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
        continue;

      const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
      VPValue *StartV =
          Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
      VPValue *StepV = PtrIV->getOperand(1);
      VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
          Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
          nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);

      VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
                                             PtrIV->getDebugLoc(), "next.gep");

      PtrIV->replaceAllUsesWith(PtrAdd);
      continue;
    }

    // Replace widened induction with scalar steps for users that only use
    // scalars.
    auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
          return U->usesScalars(WideIV);
        }))
      continue;

    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
        Plan, ID.getKind(), ID.getInductionOpcode(),
        dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
        WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
        WideIV->getDebugLoc(), Builder);

    // Update scalar users of IV to use Step instead.
    if (!HasOnlyVectorVFs)
      WideIV->replaceAllUsesWith(Steps);
    else
      WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
        return U.usesScalars(WideIV);
      });
  }
}

/// Check if \p VPV is an untruncated wide induction, either before or after the
/// increment. If so return the header IV (before the increment), otherwise
/// return null.
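/// For example (illustrative), for a wide induction %iv with step %s, both
/// %iv itself and its increment %iv.next = add %iv, %s are optimizable, and
/// the header phi %iv is returned in both cases.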
static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV) {
  auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
  if (WideIV) {
    // VPV itself is a wide induction, separately compute the end value for exit
    // users if it is not a truncated IV.
    auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
    return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
  }

  // Check if VPV is an optimizable induction increment.
  VPRecipeBase *Def = VPV->getDefiningRecipe();
  if (!Def || Def->getNumOperands() != 2)
    return nullptr;
  WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
  if (!WideIV)
    WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
  if (!WideIV)
    return nullptr;

  auto IsWideIVInc = [&]() {
    using namespace VPlanPatternMatch;
    auto &ID = WideIV->getInductionDescriptor();

    // Check if VPV increments the induction by the induction step.
    VPValue *IVStep = WideIV->getStepValue();
    switch (ID.getInductionOpcode()) {
    case Instruction::Add:
      return match(VPV, m_c_Binary<Instruction::Add>(m_Specific(WideIV),
                                                     m_Specific(IVStep)));
    case Instruction::FAdd:
      return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
                                                      m_Specific(IVStep)));
    case Instruction::FSub:
      return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
                                                    m_Specific(IVStep)));
    case Instruction::Sub: {
      // IVStep will be the negated step of the subtraction. Check if Step == -1
      // * IVStep.
      VPValue *Step;
      if (!match(VPV,
                 m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) ||
          !Step->isLiveIn() || !IVStep->isLiveIn())
        return false;
      auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
      auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue());
      return StepCI && IVStepCI &&
             StepCI->getValue() == (-1 * IVStepCI->getValue());
    }
    default:
      return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
             match(VPV, m_GetElementPtr(m_Specific(WideIV),
                                        m_Specific(WideIV->getStepValue())));
    }
    llvm_unreachable("should have been covered by switch above");
  };
  return IsWideIVInc() ? WideIV : nullptr;
}

void VPlanTransforms::optimizeInductionExitUsers(
    VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues) {
  using namespace VPlanPatternMatch;
  SmallVector<VPIRBasicBlock *> ExitVPBBs(Plan.getExitBlocks());
  if (ExitVPBBs.size() != 1)
    return;

  VPIRBasicBlock *ExitVPBB = ExitVPBBs[0];
  VPBlockBase *PredVPBB = ExitVPBB->getSinglePredecessor();
  if (!PredVPBB)
    return;
  assert(PredVPBB == Plan.getMiddleBlock() &&
         "predecessor must be the middle block");

  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
  VPBuilder B(Plan.getMiddleBlock()->getTerminator());
  for (VPRecipeBase &R : *ExitVPBB) {
    auto *ExitIRI = cast<VPIRInstruction>(&R);
    if (!isa<PHINode>(ExitIRI->getInstruction()))
      break;

    VPValue *Incoming;
    if (!match(ExitIRI->getOperand(0),
               m_VPInstruction<VPInstruction::ExtractFromEnd>(
                   m_VPValue(Incoming), m_SpecificInt(1))))
      continue;

    auto *WideIV = getOptimizableIVOf(Incoming);
    if (!WideIV)
      continue;
    VPValue *EndValue = EndValues.lookup(WideIV);
    assert(EndValue && "end value must have been pre-computed");

    if (Incoming != WideIV) {
      ExitIRI->setOperand(0, EndValue);
      continue;
    }

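    // At this point Incoming is the wide IV itself, i.e. the value before the
    // final increment. Step back once from the pre-computed end value to get
    // the escaping value; e.g. (illustrative) for an integer IV starting at 0
    // with step 1 and trip count N, the end value is N and the escaping value
    // is N - 1.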
    VPValue *Escape = nullptr;
    VPValue *Step = WideIV->getStepValue();
    Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
    if (ScalarTy->isIntegerTy()) {
      Escape =
          B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
    } else if (ScalarTy->isPointerTy()) {
      auto *Zero = Plan.getOrAddLiveIn(
          ConstantInt::get(Step->getLiveInIRValue()->getType(), 0));
      Escape = B.createPtrAdd(EndValue,
                              B.createNaryOp(Instruction::Sub, {Zero, Step}),
                              {}, "ind.escape");
    } else if (ScalarTy->isFloatingPointTy()) {
      const auto &ID = WideIV->getInductionDescriptor();
      Escape = B.createNaryOp(
          ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
              ? Instruction::FSub
              : Instruction::FAdd,
          {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
    } else {
      llvm_unreachable("all possible induction types must be handled");
    }
    ExitIRI->setOperand(0, Escape);
  }
}

/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
/// them with already existing recipes expanding the same SCEV expression.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
  DenseMap<const SCEV *, VPValue *> SCEV2VPV;

  for (VPRecipeBase &R :
       make_early_inc_range(*Plan.getEntry()->getEntryBasicBlock())) {
    auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
    if (!ExpR)
      continue;

    auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR});
    if (I.second)
      continue;
    ExpR->replaceAllUsesWith(I.first->second);
    ExpR->eraseFromParent();
  }
}

static void recursivelyDeleteDeadRecipes(VPValue *V) {
  SmallVector<VPValue *> WorkList;
  SmallPtrSet<VPValue *, 8> Seen;
  WorkList.push_back(V);

  while (!WorkList.empty()) {
    VPValue *Cur = WorkList.pop_back_val();
    if (!Seen.insert(Cur).second)
      continue;
    VPRecipeBase *R = Cur->getDefiningRecipe();
    if (!R)
      continue;
    if (!isDeadRecipe(*R))
      continue;
    WorkList.append(R->op_begin(), R->op_end());
    R->eraseFromParent();
  }
}

/// Try to simplify recipe \p R.
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
  using namespace llvm::VPlanPatternMatch;

  if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
    // Try to remove redundant blend recipes.
    SmallPtrSet<VPValue *, 4> UniqueValues;
    if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
      UniqueValues.insert(Blend->getIncomingValue(0));
    for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
      if (!match(Blend->getMask(I), m_False()))
        UniqueValues.insert(Blend->getIncomingValue(I));

    if (UniqueValues.size() == 1) {
      Blend->replaceAllUsesWith(*UniqueValues.begin());
      Blend->eraseFromParent();
      return;
    }

    if (Blend->isNormalized())
      return;

    // Normalize the blend so its first incoming value is used as the initial
    // value, with the others blended into it.

    unsigned StartIndex = 0;
    for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
      // If a value's mask is used only by the blend, then the mask can be
      // removed as dead code afterwards.
      // TODO: Find the most expensive mask that can be removed this way, or a
      // mask that's used by multiple blends where it can be removed from them
      // all.
      VPValue *Mask = Blend->getMask(I);
      if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
        StartIndex = I;
        break;
      }
    }

    SmallVector<VPValue *, 4> OperandsWithMask;
    OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));

    for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
      if (I == StartIndex)
        continue;
      OperandsWithMask.push_back(Blend->getIncomingValue(I));
      OperandsWithMask.push_back(Blend->getMask(I));
    }

    auto *NewBlend = new VPBlendRecipe(
        cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask);
    NewBlend->insertBefore(&R);

    VPValue *DeadMask = Blend->getMask(StartIndex);
    Blend->replaceAllUsesWith(NewBlend);
    Blend->eraseFromParent();
    recursivelyDeleteDeadRecipes(DeadMask);
    return;
  }

  VPValue *A;
  if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
    VPValue *Trunc = R.getVPSingleValue();
    Type *TruncTy = TypeInfo.inferScalarType(Trunc);
    Type *ATy = TypeInfo.inferScalarType(A);
    if (TruncTy == ATy) {
      Trunc->replaceAllUsesWith(A);
    } else {
      // Don't replace a scalarizing recipe with a widened cast.
      if (isa<VPReplicateRecipe>(&R))
        return;
      if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {

        unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
                                 ? Instruction::SExt
                                 : Instruction::ZExt;
        auto *VPC =
            new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
        if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
          // UnderlyingExt has distinct return type, used to retain legacy cost.
          VPC->setUnderlyingValue(UnderlyingExt);
        }
        VPC->insertBefore(&R);
        Trunc->replaceAllUsesWith(VPC);
      } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
        auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
        VPC->insertBefore(&R);
        Trunc->replaceAllUsesWith(VPC);
      }
    }
#ifndef NDEBUG
    // Verify that the cached type info for both A and its users is still
    // accurate by comparing it to freshly computed types.
    VPTypeAnalysis TypeInfo2(
        R.getParent()->getPlan()->getCanonicalIV()->getScalarType());
    assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
    for (VPUser *U : A->users()) {
      auto *R = cast<VPRecipeBase>(U);
      for (VPValue *VPV : R->definedValues())
        assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
    }
#endif
  }

  // Simplify (X && Y) || (X && !Y) -> X.
  // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
  // && (Y || Z) and (X || !X) into true. This requires queuing newly created
  // recipes to be visited during simplification.
  VPValue *X, *Y, *X1, *Y1;
  if (match(&R,
            m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
                         m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) &&
      X == X1 && Y == Y1) {
    R.getVPSingleValue()->replaceAllUsesWith(X);
    R.eraseFromParent();
    return;
  }

  if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
    return R.getVPSingleValue()->replaceAllUsesWith(A);

  if (match(&R, m_Not(m_Not(m_VPValue(A)))))
    return R.getVPSingleValue()->replaceAllUsesWith(A);

  // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
  if ((match(&R,
             m_DerivedIV(m_SpecificInt(0), m_VPValue(A), m_SpecificInt(1))) ||
       match(&R,
             m_DerivedIV(m_SpecificInt(0), m_SpecificInt(0), m_VPValue()))) &&
      TypeInfo.inferScalarType(R.getOperand(1)) ==
          TypeInfo.inferScalarType(R.getVPSingleValue()))
    return R.getVPSingleValue()->replaceAllUsesWith(R.getOperand(1));
}

/// Try to simplify the recipes in \p Plan. Use \p CanonicalIVTy as type for all
/// un-typed live-ins in VPTypeAnalysis.
static void simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  VPTypeAnalysis TypeInfo(&CanonicalIVTy);
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      simplifyRecipe(R, TypeInfo);
    }
  }
}

void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                         unsigned BestUF,
                                         PredicatedScalarEvolution &PSE) {
  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
  assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
  auto *Term = &ExitingVPBB->back();
  // Try to simplify the branch condition if TC <= VF * UF when preparing to
  // execute the plan for the main vector loop. We only do this if the
  // terminator is:
  //  1. BranchOnCount, or
  //  2. BranchOnCond where the input is Not(ActiveLaneMask).
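  // E.g. (illustrative): with a trip count of 8, VF = 8 and UF = 1, the vector
  // loop body executes exactly once, so the backedge check is redundant.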
  using namespace llvm::VPlanPatternMatch;
  if (!match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) &&
      !match(Term,
             m_BranchOnCond(m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue())))))
    return;

  ScalarEvolution &SE = *PSE.getSE();
  const SCEV *TripCount =
      vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
  assert(!isa<SCEVCouldNotCompute>(TripCount) &&
         "Trip count SCEV must be computable");
  ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
  const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
  if (TripCount->isZero() ||
      !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
    return;

  // The vector loop region only executes once. If possible, completely remove
  // the region, otherwise replace the terminator controlling the latch with
  // (BranchOnCond true).
  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
  auto *CanIVTy = Plan.getCanonicalIV()->getScalarType();
  if (all_of(
          Header->phis(),
          IsaPred<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe>)) {
    for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
      auto *HeaderPhiR = cast<VPHeaderPHIRecipe>(&HeaderR);
      HeaderPhiR->replaceAllUsesWith(HeaderPhiR->getStartValue());
      HeaderPhiR->eraseFromParent();
    }

    VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
    VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
    VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
    VPBlockUtils::disconnectBlocks(VectorRegion, Exit);

    for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
      B->setParent(nullptr);

    VPBlockUtils::connectBlocks(Preheader, Header);
    VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
    simplifyRecipes(Plan, *CanIVTy);
  } else {
    // The vector region contains header phis for which we cannot remove the
    // loop region yet.
    LLVMContext &Ctx = SE.getContext();
    auto *BOC = new VPInstruction(
        VPInstruction::BranchOnCond,
        {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc());
    ExitingVPBB->appendRecipe(BOC);
  }

  Term->eraseFromParent();
  VPlanTransforms::removeDeadRecipes(Plan);

  Plan.setVF(BestVF);
  Plan.setUF(BestUF);
  // TODO: Further simplifications are possible
  //      1. Replace inductions with constants.
  //      2. Replace vector loop region with VPBasicBlock.
}

/// Sink users of \p FOR after the recipe defining the previous value \p
/// Previous of the recurrence. \returns true if all users of \p FOR could be
/// re-arranged as needed or false if it is not possible.
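/// Illustrative example, with a user of the recurrence appearing before
/// Previous:
///
///   %for = phi [ %init, %ph ], [ %p, %loop ]
///   %use = add %for, 1    ; user of the recurrence
///   %p   = ...            ; Previous, defines the next value of %for
///
/// Here %use must be sunk after %p before introducing the recipe combining
/// both values.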
static bool
sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
                                 VPRecipeBase *Previous,
                                 VPDominatorTree &VPDT) {
  // Collect recipes that need sinking.
  SmallVector<VPRecipeBase *> WorkList;
  SmallPtrSet<VPRecipeBase *, 8> Seen;
  Seen.insert(Previous);
  auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
    // The previous value must not depend on the users of the recurrence phi;
    // otherwise, FOR is not a fixed-order recurrence.
    if (SinkCandidate == Previous)
      return false;

    if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
        !Seen.insert(SinkCandidate).second ||
        VPDT.properlyDominates(Previous, SinkCandidate))
      return true;

    if (SinkCandidate->mayHaveSideEffects())
      return false;

    WorkList.push_back(SinkCandidate);
    return true;
  };

  // Recursively sink users of FOR after Previous.
  WorkList.push_back(FOR);
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    VPRecipeBase *Current = WorkList[I];
    assert(Current->getNumDefinedValues() == 1 &&
           "only recipes with a single defined value expected");

    for (VPUser *User : Current->getVPSingleValue()->users()) {
      if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
        return false;
    }
  }

  // Keep recipes to sink ordered by dominance so earlier instructions are
  // processed first.
  sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
    return VPDT.properlyDominates(A, B);
  });

  for (VPRecipeBase *SinkCandidate : WorkList) {
    if (SinkCandidate == FOR)
      continue;

    SinkCandidate->moveAfter(Previous);
    Previous = SinkCandidate;
  }
  return true;
}

/// Try to hoist \p Previous and its operands before all users of \p FOR.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
                                        VPRecipeBase *Previous,
                                        VPDominatorTree &VPDT) {
  if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory())
    return false;

  // Collect recipes that need hoisting.
  SmallVector<VPRecipeBase *> HoistCandidates;
  SmallPtrSet<VPRecipeBase *, 8> Visited;
  VPRecipeBase *HoistPoint = nullptr;
  // Find the closest hoist point by looking at all users of FOR and selecting
  // the recipe dominating all other users.
  for (VPUser *U : FOR->users()) {
    auto *R = cast<VPRecipeBase>(U);
    if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
      HoistPoint = R;
  }
  assert(all_of(FOR->users(),
                [&VPDT, HoistPoint](VPUser *U) {
                  auto *R = cast<VPRecipeBase>(U);
                  return HoistPoint == R ||
                         VPDT.properlyDominates(HoistPoint, R);
                }) &&
         "HoistPoint must dominate all users of FOR");

  auto NeedsHoisting = [HoistPoint, &VPDT,
                        &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
    VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
    if (!HoistCandidate)
      return nullptr;
    VPRegionBlock *EnclosingLoopRegion =
        HoistCandidate->getParent()->getEnclosingLoopRegion();
    assert((!HoistCandidate->getParent()->getParent() ||
            HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
           "CFG in VPlan should still be flat, without replicate regions");
    // Hoist candidate was already visited, no need to hoist.
    if (!Visited.insert(HoistCandidate).second)
      return nullptr;

    // Candidate is outside the loop region or a header phi; it dominates the
    // users of FOR without hoisting.
    if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
      return nullptr;

    // If we reached a recipe that dominates HoistPoint, we don't need to
    // hoist the recipe.
    if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
      return nullptr;
    return HoistCandidate;
  };
  auto CanHoist = [&](VPRecipeBase *HoistCandidate) {
    // Avoid hoisting candidates with side-effects, as we do not yet analyze
    // associated dependencies.
    return !HoistCandidate->mayHaveSideEffects();
  };

  if (!NeedsHoisting(Previous->getVPSingleValue()))
    return true;

  // Recursively try to hoist Previous and its operands before all users of FOR.
  HoistCandidates.push_back(Previous);

  for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
    VPRecipeBase *Current = HoistCandidates[I];
    assert(Current->getNumDefinedValues() == 1 &&
           "only recipes with a single defined value expected");
    if (!CanHoist(Current))
      return false;

    for (VPValue *Op : Current->operands()) {
      // If we reach FOR, it means the original Previous depends on some other
      // recurrence that in turn depends on FOR. If that is the case, we would
      // also need to hoist recipes involving the other FOR, which may break
      // dependencies.
      if (Op == FOR)
        return false;

      if (auto *R = NeedsHoisting(Op))
        HoistCandidates.push_back(R);
    }
  }

  // Order recipes to hoist by dominance so earlier instructions are processed
  // first.
  sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
    return VPDT.properlyDominates(A, B);
  });

  for (VPRecipeBase *HoistCandidate : HoistCandidates) {
    HoistCandidate->moveBefore(*HoistPoint->getParent(),
                               HoistPoint->getIterator());
  }

  return true;
}

bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
                                                  VPBuilder &LoopBuilder) {
  VPDominatorTree VPDT;
  VPDT.recalculate(Plan);

  SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
  for (VPRecipeBase &R :
       Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis())
    if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
      RecurrencePhis.push_back(FOR);

  for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
    SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
    VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
    // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
    // to terminate.
    while (auto *PrevPhi =
               dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Previous)) {
      assert(PrevPhi->getParent() == FOR->getParent());
      assert(SeenPhis.insert(PrevPhi).second);
      Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
    }

    if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
        !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
      return false;

    // Introduce a recipe to combine the incoming and previous values of a
    // fixed-order recurrence.
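    // FirstOrderRecurrenceSplice concatenates the last lane of the vector from
    // the previous iteration (the phi) with the first VF - 1 lanes of the
    // current value, e.g. (illustrative, VF = 4):
    //   splice(<p0, p1, p2, p3>, <c0, c1, c2, c3>) = <p3, c0, c1, c2>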
    VPBasicBlock *InsertBlock = Previous->getParent();
    if (isa<VPHeaderPHIRecipe>(Previous))
      LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
    else
      LoopBuilder.setInsertPoint(InsertBlock,
                                 std::next(Previous->getIterator()));

    auto *RecurSplice = cast<VPInstruction>(
        LoopBuilder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
                                 {FOR, FOR->getBackedgeValue()}));

    FOR->replaceAllUsesWith(RecurSplice);
    // Set the first operand of RecurSplice to FOR again, after replacing
    // all users.
    RecurSplice->setOperand(0, FOR);
  }
  return true;
}

void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
  for (VPRecipeBase &R :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
    if (!PhiR)
      continue;
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
    RecurKind RK = RdxDesc.getRecurrenceKind();
    if (RK != RecurKind::Add && RK != RecurKind::Mul)
      continue;

    for (VPUser *U : collectUsersRecursively(PhiR))
      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
        RecWithFlags->dropPoisonGeneratingFlags();
      }
  }
}

/// Move loop-invariant recipes out of the vector loop region in \p Plan.
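/// For example (illustrative), a recipe multiplying two values defined outside
/// the loop region can be moved to the preheader and computed once, since it
/// neither reads nor writes memory.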
1276 static void licm(VPlan &Plan) {
1277   VPBasicBlock *Preheader = Plan.getVectorPreheader();
1278 
1279   // Return true if we do not know how to (mechanically) hoist a given recipe
1280   // out of a loop region. Does not address legality concerns such as aliasing
1281   // or speculation safety.
1282   auto CannotHoistRecipe = [](VPRecipeBase &R) {
1283     // Allocas cannot be hoisted.
1284     auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1285     return RepR && RepR->getOpcode() == Instruction::Alloca;
1286   };
1287 
1288   // Hoist any loop invariant recipes from the vector loop region to the
1289   // preheader. Perform a shallow traversal of the vector loop region, to
1290   // exclude recipes in replicate regions.
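  // E.g. (illustrative), a widened multiply of two values defined outside
  // the loop region,
  //   WIDEN ir<%inv> = mul ir<%n>, ir<%m>
  // can be moved to the vector preheader so it executes once rather than on
  // every vector iteration.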
1291   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1292   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1293            vp_depth_first_shallow(LoopRegion->getEntry()))) {
1294     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1295       if (CannotHoistRecipe(R))
1296         continue;
1297       // TODO: Relax checks in the future, e.g. we could also hoist reads, if
1298       // their memory location is not modified in the vector loop.
1299       if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
1300           any_of(R.operands(), [](VPValue *Op) {
1301             return !Op->isDefinedOutsideLoopRegions();
1302           }))
1303         continue;
1304       R.moveBefore(*Preheader, Preheader->end());
1305     }
1306   }
1307 }
1308 
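/// Illustrative example: assuming MinBWs records that a widened i32 add only
/// needs 8 bits, the recipe for
///   %res = add i32 %a, %b
/// is conceptually rewritten as
///   %a.t = trunc i32 %a to i8
///   %b.t = trunc i32 %b to i8
///   %res.t = add i8 %a.t, %b.t
///   %res = zext i8 %res.t to i32
/// with truncates re-used where possible and redundant casts cleaned up by
/// later recipe simplification.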
1309 void VPlanTransforms::truncateToMinimalBitwidths(
1310     VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
1311 #ifndef NDEBUG
1312   // Count the processed recipes and cross check the count later with MinBWs
1313   // size, to make sure all entries in MinBWs have been handled.
1314   unsigned NumProcessedRecipes = 0;
1315 #endif
1316   // Keep track of created truncates, so they can be re-used. Note that we
1317   // cannot use RAUW after creating a new truncate, as this could make
1318   // other uses have different types for their operands, making them invalidly
1319   // typed.
1320   DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
1321   Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
1322   VPTypeAnalysis TypeInfo(CanonicalIVType);
1323   VPBasicBlock *PH = Plan.getVectorPreheader();
1324   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1325            vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
1326     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1327       if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
1328                VPWidenSelectRecipe, VPWidenLoadRecipe>(&R))
1329         continue;
1330 
1331       VPValue *ResultVPV = R.getVPSingleValue();
1332       auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
1333       unsigned NewResSizeInBits = MinBWs.lookup(UI);
1334       if (!NewResSizeInBits)
1335         continue;
1336 
1337 #ifndef NDEBUG
1338       NumProcessedRecipes++;
1339 #endif
1340       // If the value wasn't vectorized, we must maintain the original scalar
1341       // type. Skip those here, after incrementing NumProcessedRecipes. Also
1342       // skip casts which do not need to be handled explicitly here, as
1343       // redundant casts will be removed during recipe simplification.
1344       if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R)) {
1345 #ifndef NDEBUG
1346         // If any of the operands is a live-in and not used by VPWidenRecipe or
1347         // VPWidenSelectRecipe, but in MinBWs, make sure it is counted as
1348         // processed as well. When MinBWs is currently constructed, there is no
1349         // information about whether recipes are widened or replicated and in
1350         // case they are reciplicated the operands are not truncated. Counting
1351         // them them here ensures we do not miss any recipes in MinBWs.
1352         // TODO: Remove once the analysis is done on VPlan.
1353         for (VPValue *Op : R.operands()) {
1354           if (!Op->isLiveIn())
1355             continue;
1356           auto *UV = dyn_cast_or_null<Instruction>(Op->getUnderlyingValue());
1357           if (UV && MinBWs.contains(UV) && !ProcessedTruncs.contains(Op) &&
1358               none_of(Op->users(),
1359                       IsaPred<VPWidenRecipe, VPWidenSelectRecipe>)) {
1360             // Add an entry to ProcessedTruncs to avoid counting the same
1361             // operand multiple times.
1362             ProcessedTruncs[Op] = nullptr;
1363             NumProcessedRecipes += 1;
1364           }
1365         }
1366 #endif
1367         continue;
1368       }
1369 
1370       Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
1371       unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
1372       assert(OldResTy->isIntegerTy() && "only integer types supported");
1373       (void)OldResSizeInBits;
1374 
1375       LLVMContext &Ctx = CanonicalIVType->getContext();
1376       auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);
1377 
1378       // Any wrapping introduced by shrinking this operation shouldn't be
1379       // considered undefined behavior. So, we can't unconditionally copy
1380       // arithmetic wrapping flags to VPW.
1381       if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
1382         VPW->dropPoisonGeneratingFlags();
1383 
1384       using namespace llvm::VPlanPatternMatch;
1385       if (OldResSizeInBits != NewResSizeInBits &&
1386           !match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue()))) {
1387         // Extend result to original width.
1388         auto *Ext =
1389             new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
1390         Ext->insertAfter(&R);
1391         ResultVPV->replaceAllUsesWith(Ext);
1392         Ext->setOperand(0, ResultVPV);
1393         assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
1394       } else {
1395         assert(
1396             match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) &&
1397             "Only ICmps should not need extending the result.");
1398       }
1399 
1400       assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
1401       if (isa<VPWidenLoadRecipe>(&R))
1402         continue;
1403 
1404       // Shrink operands by introducing truncates as needed.
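      // For selects, operand 0 is the i1 mask condition and keeps its type;
      // only the select's value operands are truncated.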
1405       unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
1406       for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
1407         auto *Op = R.getOperand(Idx);
1408         unsigned OpSizeInBits =
1409             TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
1410         if (OpSizeInBits == NewResSizeInBits)
1411           continue;
1412         assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
1413         auto [ProcessedIter, IterIsEmpty] =
1414             ProcessedTruncs.insert({Op, nullptr});
1415         VPWidenCastRecipe *NewOp =
1416             IterIsEmpty
1417                 ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
1418                 : ProcessedIter->second;
1419         R.setOperand(Idx, NewOp);
1420         if (!IterIsEmpty)
1421           continue;
1422         ProcessedIter->second = NewOp;
1423         if (!Op->isLiveIn()) {
1424           NewOp->insertBefore(&R);
1425         } else {
1426           PH->appendRecipe(NewOp);
1427 #ifndef NDEBUG
1428           auto *OpInst = dyn_cast<Instruction>(Op->getLiveInIRValue());
1429           bool IsContained = MinBWs.contains(OpInst);
1430           NumProcessedRecipes += IsContained;
1431 #endif
1432         }
1433       }
1434 
1435     }
1436   }
1437 
1438   assert(MinBWs.size() == NumProcessedRecipes &&
1439          "some entries in MinBWs haven't been processed");
1440 }
1441 
1442 void VPlanTransforms::optimize(VPlan &Plan) {
1443   runPass(removeRedundantCanonicalIVs, Plan);
1444   runPass(removeRedundantInductionCasts, Plan);
1445 
1446   runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
1447   runPass(removeDeadRecipes, Plan);
1448   runPass(legalizeAndOptimizeInductions, Plan);
1449   runPass(removeRedundantExpandSCEVRecipes, Plan);
1450   runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
1451   runPass(removeDeadRecipes, Plan);
1452 
1453   runPass(createAndOptimizeReplicateRegions, Plan);
1454   runPass(mergeBlocksIntoPredecessors, Plan);
1455   runPass(licm, Plan);
1456 }
1457 
1458 // Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
1459 // the loop terminator with a branch-on-cond recipe with the negated
1460 // active-lane-mask as operand. Note that this turns the loop into an
1461 // uncountable one. Only the existing terminator is replaced; all other existing
1462 // recipes/users remain unchanged, except for poison-generating flags being
1463 // dropped from the canonical IV increment. Return the created
1464 // VPActiveLaneMaskPHIRecipe.
1465 //
1466 // The function uses the following definitions:
1467 //
1468 //  %TripCount = DataAndControlFlowWithoutRuntimeCheck ?
1469 //    calculate-trip-count-minus-VF (original TC) : original TC
1470 //  %IncrementValue = DataAndControlFlowWithoutRuntimeCheck ?
1471 //    CanonicalIVPhi : CanonicalIVIncrement
1472 //  %StartV is the canonical induction start value.
1473 //
1474 // The function adds the following recipes:
1475 //
1476 // vector.ph:
1477 //   %TripCount = calculate-trip-count-minus-VF (original TC)
1478 //       [if DataAndControlFlowWithoutRuntimeCheck]
1479 //   %EntryInc = canonical-iv-increment-for-part %StartV
1480 //   %EntryALM = active-lane-mask %EntryInc, %TripCount
1481 //
1482 // vector.body:
1483 //   ...
1484 //   %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
1485 //   ...
1486 //   %InLoopInc = canonical-iv-increment-for-part %IncrementValue
1487 //   %ALM = active-lane-mask %InLoopInc, %TripCount
1488 //   %Negated = Not %ALM
1489 //   branch-on-cond %Negated
1490 //
1491 static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
1492     VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
1493   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
1494   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
1495   auto *CanonicalIVPHI = Plan.getCanonicalIV();
1496   VPValue *StartV = CanonicalIVPHI->getStartValue();
1497 
1498   auto *CanonicalIVIncrement =
1499       cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
1500   // TODO: Check if dropping the flags is needed if
1501   // !DataAndControlFlowWithoutRuntimeCheck.
1502   CanonicalIVIncrement->dropPoisonGeneratingFlags();
1503   DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
1504   // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
1505   // we have to take unrolling into account. Each part needs to start at
1506   //   Part * VF
1507   auto *VecPreheader = Plan.getVectorPreheader();
1508   VPBuilder Builder(VecPreheader);
1509 
1510   // Create the ActiveLaneMask instruction using the correct start values.
1511   VPValue *TC = Plan.getTripCount();
1512 
1513   VPValue *TripCount, *IncrementValue;
1514   if (!DataAndControlFlowWithoutRuntimeCheck) {
1515     // When the loop is guarded by a runtime overflow check for the loop
1516     // induction variable increment by VF, we can increment the value before
1517     // the get.active.lane.mask intrinsic and use the unmodified trip count.
1518     IncrementValue = CanonicalIVIncrement;
1519     TripCount = TC;
1520   } else {
1521     // When avoiding a runtime check, the active.lane.mask inside the loop
1522     // uses a modified trip count and the induction variable increment is
1523     // done after the active.lane.mask intrinsic is called.
1524     IncrementValue = CanonicalIVPHI;
1525     TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
1526                                      {TC}, DL);
1527   }
1528   auto *EntryIncrement = Builder.createOverflowingOp(
1529       VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
1530       "index.part.next");
1531 
1532   // Create the active lane mask instruction in the VPlan preheader.
1533   auto *EntryALM =
1534       Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
1535                            DL, "active.lane.mask.entry");
1536 
1537   // Now create the ActiveLaneMaskPhi recipe in the main loop using the
1538   // preheader ActiveLaneMask instruction.
1539   auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
1540   LaneMaskPhi->insertAfter(CanonicalIVPHI);
1541 
1542   // Create the active lane mask for the next iteration of the loop before the
1543   // original terminator.
1544   VPRecipeBase *OriginalTerminator = EB->getTerminator();
1545   Builder.setInsertPoint(OriginalTerminator);
1546   auto *InLoopIncrement =
1547       Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
1548                                   {IncrementValue}, {false, false}, DL);
1549   auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
1550                                    {InLoopIncrement, TripCount}, DL,
1551                                    "active.lane.mask.next");
1552   LaneMaskPhi->addOperand(ALM);
1553 
1554   // Replace the original terminator with BranchOnCond. We have to invert the
1555   // mask here because a true condition means jumping to the exit block.
1556   auto *NotMask = Builder.createNot(ALM, DL);
1557   Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
1558   OriginalTerminator->eraseFromParent();
1559   return LaneMaskPhi;
1560 }
1561 
1562 /// Collect all VPValues representing a header mask through the (ICMP_ULE,
1563 /// WideCanonicalIV, backedge-taken-count) pattern.
1564 /// TODO: Introduce explicit recipe for header-mask instead of searching
1565 /// for the header-mask pattern manually.
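/// A header mask conceptually looks like (illustrative):
///   EMIT %wide.iv = WIDEN-CANONICAL-INDUCTION %iv
///   EMIT %mask = icmp ule %wide.iv, %backedge.taken.count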
1566 static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
1567   SmallVector<VPValue *> WideCanonicalIVs;
1568   auto *FoundWidenCanonicalIVUser =
1569       find_if(Plan.getCanonicalIV()->users(),
1570               [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
1571   assert(count_if(Plan.getCanonicalIV()->users(),
1572                   [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <=
1573              1 &&
1574          "Must have at most one VPWideCanonicalIVRecipe");
1575   if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) {
1576     auto *WideCanonicalIV =
1577         cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
1578     WideCanonicalIVs.push_back(WideCanonicalIV);
1579   }
1580 
1581   // Also include VPWidenIntOrFpInductionRecipes that represent a widened
1582   // version of the canonical induction.
1583   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1584   for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1585     auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1586     if (WidenOriginalIV && WidenOriginalIV->isCanonical())
1587       WideCanonicalIVs.push_back(WidenOriginalIV);
1588   }
1589 
1590   // Walk users of wide canonical IVs and collect all compares of the form
1591   // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
1592   SmallVector<VPValue *> HeaderMasks;
1593   for (auto *Wide : WideCanonicalIVs) {
1594     for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
1595       auto *HeaderMask = dyn_cast<VPInstruction>(U);
1596       if (!HeaderMask || !vputils::isHeaderMask(HeaderMask, Plan))
1597         continue;
1598 
1599       assert(HeaderMask->getOperand(0) == Wide &&
1600              "WidenCanonicalIV must be the first operand of the compare");
1601       HeaderMasks.push_back(HeaderMask);
1602     }
1603   }
1604   return HeaderMasks;
1605 }
1606 
1607 void VPlanTransforms::addActiveLaneMask(
1608     VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
1609     bool DataAndControlFlowWithoutRuntimeCheck) {
1610   assert((!DataAndControlFlowWithoutRuntimeCheck ||
1611           UseActiveLaneMaskForControlFlow) &&
1612          "DataAndControlFlowWithoutRuntimeCheck implies "
1613          "UseActiveLaneMaskForControlFlow");
1614 
1615   auto *FoundWidenCanonicalIVUser =
1616       find_if(Plan.getCanonicalIV()->users(),
1617               [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
1618   assert(FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end() &&
1619          "Must have widened canonical IV when tail folding!");
1620   auto *WideCanonicalIV =
1621       cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
1622   VPSingleDefRecipe *LaneMask;
1623   if (UseActiveLaneMaskForControlFlow) {
1624     LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
1625         Plan, DataAndControlFlowWithoutRuntimeCheck);
1626   } else {
1627     VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
1628     LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
1629                               {WideCanonicalIV, Plan.getTripCount()}, nullptr,
1630                               "active.lane.mask");
1631   }
1632 
1633   // Walk users of WideCanonicalIV and replace all compares of the form
1634   // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
1635   // active-lane-mask.
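  // Conceptually (illustrative):
  //   EMIT %mask = icmp ule %wide.iv, %backedge.taken.count
  // becomes
  //   EMIT %mask = active-lane-mask %wide.iv, %trip.count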
1636   for (VPValue *HeaderMask : collectAllHeaderMasks(Plan))
1637     HeaderMask->replaceAllUsesWith(LaneMask);
1638 }
1639 
1640 /// Try to convert \p CurRecipe to a corresponding EVL-based recipe. Returns
1641 /// nullptr if no EVL-based recipe could be created.
1642 /// \p HeaderMask  The header mask.
1643 /// \p CurRecipe   The recipe to be transformed.
1644 /// \p TypeInfo    VPlan-based type analysis.
1645 /// \p AllOneMask  The vector mask parameter of vector-predication intrinsics.
1646 /// \p EVL         The explicit vector length parameter of vector-predication
1647 /// intrinsics.
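/// For example (illustrative), a widened load masked only by the header mask
///   WIDEN %v = load %addr, mask: %header.mask
/// becomes an EVL-based load whose masking folds into the explicit vector
/// length:
///   WIDEN %v = vp.load %addr, all-true, %evl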
1648 static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask,
1649                                      VPRecipeBase &CurRecipe,
1650                                      VPTypeAnalysis &TypeInfo,
1651                                      VPValue &AllOneMask, VPValue &EVL) {
1652   using namespace llvm::VPlanPatternMatch;
1653   auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
1654     assert(OrigMask && "Unmasked recipe when folding tail");
1655     return HeaderMask == OrigMask ? nullptr : OrigMask;
1656   };
1657 
1658   return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)
1659       .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
1660         VPValue *NewMask = GetNewMask(L->getMask());
1661         return new VPWidenLoadEVLRecipe(*L, EVL, NewMask);
1662       })
1663       .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
1664         VPValue *NewMask = GetNewMask(S->getMask());
1665         return new VPWidenStoreEVLRecipe(*S, EVL, NewMask);
1666       })
1667       .Case<VPWidenRecipe>([&](VPWidenRecipe *W) -> VPRecipeBase * {
1668         unsigned Opcode = W->getOpcode();
1669         if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
1670           return nullptr;
1671         return new VPWidenEVLRecipe(*W, EVL);
1672       })
1673       .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
1674         VPValue *NewMask = GetNewMask(Red->getCondOp());
1675         return new VPReductionEVLRecipe(*Red, EVL, NewMask);
1676       })
1677       .Case<VPWidenIntrinsicRecipe, VPWidenCastRecipe>(
1678           [&](auto *CR) -> VPRecipeBase * {
1679             Intrinsic::ID VPID;
1680             if (auto *CallR = dyn_cast<VPWidenIntrinsicRecipe>(CR)) {
1681               VPID =
1682                   VPIntrinsic::getForIntrinsic(CallR->getVectorIntrinsicID());
1683             } else {
1684               auto *CastR = cast<VPWidenCastRecipe>(CR);
1685               VPID = VPIntrinsic::getForOpcode(CastR->getOpcode());
1686             }
1687 
1688             // Not all intrinsics have a corresponding VP intrinsic.
1689             if (VPID == Intrinsic::not_intrinsic)
1690               return nullptr;
1691             assert(VPIntrinsic::getMaskParamPos(VPID) &&
1692                    VPIntrinsic::getVectorLengthParamPos(VPID) &&
1693                    "Expected VP intrinsic to have mask and EVL");
1694 
1695             SmallVector<VPValue *> Ops(CR->operands());
1696             Ops.push_back(&AllOneMask);
1697             Ops.push_back(&EVL);
1698             return new VPWidenIntrinsicRecipe(
1699                 VPID, Ops, TypeInfo.inferScalarType(CR), CR->getDebugLoc());
1700           })
1701       .Case<VPWidenSelectRecipe>([&](VPWidenSelectRecipe *Sel) {
1702         SmallVector<VPValue *> Ops(Sel->operands());
1703         Ops.push_back(&EVL);
1704         return new VPWidenIntrinsicRecipe(Intrinsic::vp_select, Ops,
1705                                           TypeInfo.inferScalarType(Sel),
1706                                           Sel->getDebugLoc());
1707       })
1708       .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
1709         VPValue *LHS, *RHS;
1710         // Transform select with a header mask condition
1711         //   select(header_mask, LHS, RHS)
1712         // into vector predication merge.
1713         //   vp.merge(all-true, LHS, RHS, EVL)
1714         if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
1715                                  m_VPValue(RHS))))
1716           return nullptr;
1717         // Use all true as the condition because this transformation is
1718         // limited to selects whose condition is a header mask.
1719         return new VPWidenIntrinsicRecipe(
1720             Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
1721             TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
1722       })
1723       .Default([&](VPRecipeBase *R) { return nullptr; });
1724 }
1725 
1726 /// Replace recipes with their EVL variants.
1727 static void transformRecipesToEVLRecipes(VPlan &Plan, VPValue &EVL) {
1728   Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
1729   VPTypeAnalysis TypeInfo(CanonicalIVType);
1730   LLVMContext &Ctx = CanonicalIVType->getContext();
1731   VPValue *AllOneMask = Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx));
1732 
1733   for (VPUser *U : to_vector(Plan.getVF().users())) {
1734     if (auto *R = dyn_cast<VPReverseVectorPointerRecipe>(U))
1735       R->setOperand(1, &EVL);
1736   }
1737 
1738   SmallVector<VPRecipeBase *> ToErase;
1739 
1740   for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
1741     for (VPUser *U : collectUsersRecursively(HeaderMask)) {
1742       auto *CurRecipe = cast<VPRecipeBase>(U);
1743       VPRecipeBase *EVLRecipe =
1744           createEVLRecipe(HeaderMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
1745       if (!EVLRecipe)
1746         continue;
1747 
1748       [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
1749       assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
1750              "New recipe must define the same number of values as the "
1751              "original.");
1752       assert(
1753           NumDefVal <= 1 &&
1754           "Only supports recipes with a single definition or without users.");
1755       EVLRecipe->insertBefore(CurRecipe);
1756       if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) {
1757         VPValue *CurVPV = CurRecipe->getVPSingleValue();
1758         CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue());
1759       }
1760       // Defer erasing recipes till the end so that we don't invalidate the
1761       // VPTypeAnalysis cache.
1762       ToErase.push_back(CurRecipe);
1763     }
1764   }
1765 
1766   for (VPRecipeBase *R : reverse(ToErase)) {
1767     SmallVector<VPValue *> PossiblyDead(R->operands());
1768     R->eraseFromParent();
1769     for (VPValue *Op : PossiblyDead)
1770       recursivelyDeleteDeadRecipes(Op);
1771   }
1772 }
1773 
1774 /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
1775 /// replace all uses of VPCanonicalIVPHIRecipe, except the canonical IV
1776 /// increment, with a VPEVLBasedIVPHIRecipe. After this transformation,
1777 /// VPCanonicalIVPHIRecipe is only used for counting loop iterations.
1778 ///
1779 /// The function uses the following definitions:
1780 ///  %StartV is the canonical induction start value.
1781 ///
1782 /// The function adds the following recipes:
1783 ///
1784 /// vector.ph:
1785 /// ...
1786 ///
1787 /// vector.body:
1788 /// ...
1789 /// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
1790 ///                                               [ %NextEVLIV, %vector.body ]
1791 /// %AVL = sub original TC, %EVLPhi
1792 /// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
1793 /// ...
1794 /// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
1795 /// ...
1796 ///
1797 /// If MaxSafeElements is provided, the function adds the following recipes:
1798 /// vector.ph:
1799 /// ...
1800 ///
1801 /// vector.body:
1802 /// ...
1803 /// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
1804 ///                                               [ %NextEVLIV, %vector.body ]
1805 /// %AVL = sub original TC, %EVLPhi
1806 /// %cmp = cmp ult %AVL, MaxSafeElements
1807 /// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
1808 /// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
1809 /// ...
1810 /// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
1811 /// ...
1812 ///
1813 bool VPlanTransforms::tryAddExplicitVectorLength(
1814     VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
1815   VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1816   // The transform updates all users of inductions to work based on EVL, instead
1817   // of the VF directly. At the moment, widened inductions cannot be updated, so
1818   // bail out if the plan contains any.
1819   bool ContainsWidenInductions = any_of(
1820       Header->phis(),
1821       IsaPred<VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>);
1822   if (ContainsWidenInductions)
1823     return false;
1824 
1825   auto *CanonicalIVPHI = Plan.getCanonicalIV();
1826   VPValue *StartV = CanonicalIVPHI->getStartValue();
1827 
1828   // Create the ExplicitVectorLengthPhi recipe in the main loop.
1829   auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
1830   EVLPhi->insertAfter(CanonicalIVPHI);
1831   VPBuilder Builder(Header, Header->getFirstNonPhi());
1832   // Compute original TC - IV as the AVL (application vector length).
1833   VPValue *AVL = Builder.createNaryOp(
1834       Instruction::Sub, {Plan.getTripCount(), EVLPhi}, DebugLoc(), "avl");
1835   if (MaxSafeElements) {
1836     // Clamp the AVL to MaxSafeElements (the maximum safe dependence distance).
1837     VPValue *AVLSafe = Plan.getOrAddLiveIn(
1838         ConstantInt::get(CanonicalIVPHI->getScalarType(), *MaxSafeElements));
1839     VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
1840     AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl");
1841   }
1842   auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
1843                                      DebugLoc());
1844 
1845   auto *CanonicalIVIncrement =
1846       cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
1847   VPSingleDefRecipe *OpVPEVL = VPEVL;
1848   if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits();
1849       IVSize != 32) {
1850     OpVPEVL = new VPScalarCastRecipe(
1851         IVSize < 32 ? Instruction::Trunc : Instruction::ZExt, OpVPEVL,
1852         CanonicalIVPHI->getScalarType(), CanonicalIVIncrement->getDebugLoc());
1853     OpVPEVL->insertBefore(CanonicalIVIncrement);
1854   }
1855   auto *NextEVLIV =
1856       new VPInstruction(Instruction::Add, {OpVPEVL, EVLPhi},
1857                         {CanonicalIVIncrement->hasNoUnsignedWrap(),
1858                          CanonicalIVIncrement->hasNoSignedWrap()},
1859                         CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
1860   NextEVLIV->insertBefore(CanonicalIVIncrement);
1861   EVLPhi->addOperand(NextEVLIV);
1862 
1863   transformRecipesToEVLRecipes(Plan, *VPEVL);
1864 
1865   // Replace all uses of VPCanonicalIVPHIRecipe by
1866   // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
1867   CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
1868   CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
1869   // TODO: support unroll factor > 1.
1870   Plan.setUF(1);
1871   return true;
1872 }
1873 
1874 void VPlanTransforms::dropPoisonGeneratingRecipes(
1875     VPlan &Plan,
1876     const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
1877   // Collect recipes in the backward slice of `Root` that may generate a poison
1878   // value that is used after vectorization.
1879   SmallPtrSet<VPRecipeBase *, 16> Visited;
1880   auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1881     SmallVector<VPRecipeBase *, 16> Worklist;
1882     Worklist.push_back(Root);
1883 
1884     // Traverse the backward slice of Root through its use-def chain.
1885     while (!Worklist.empty()) {
1886       VPRecipeBase *CurRec = Worklist.pop_back_val();
1887 
1888       if (!Visited.insert(CurRec).second)
1889         continue;
1890 
1891       // Prune search if we find another recipe generating a widen memory
1892       // instruction. Widen memory instructions involved in address computation
1893       // will lead to gather/scatter instructions, which don't need to be
1894       // handled.
1895       if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
1896               VPHeaderPHIRecipe>(CurRec))
1897         continue;
1898 
1899       // This recipe contributes to the address computation of a widen
1900       // load/store. If the underlying instruction has poison-generating flags,
1901       // drop them directly.
1902       if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
1903         VPValue *A, *B;
1904         using namespace llvm::VPlanPatternMatch;
1905         // Dropping disjoint from an OR may yield incorrect results, as some
1906         // analysis may have converted it to an Add implicitly (e.g. SCEV used
1907         // for dependence analysis). Instead, replace it with an equivalent Add.
1908         // This is possible as all users of the disjoint OR only access lanes
1909         // where the operands are disjoint or poison otherwise.
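        // E.g. (illustrative):
        //   %x = or disjoint %a, %b  -->  %x = add %a, %b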
1910         if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
1911             RecWithFlags->isDisjoint()) {
1912           VPBuilder Builder(RecWithFlags);
1913           VPInstruction *New = Builder.createOverflowingOp(
1914               Instruction::Add, {A, B}, {false, false},
1915               RecWithFlags->getDebugLoc());
1916           New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
1917           RecWithFlags->replaceAllUsesWith(New);
1918           RecWithFlags->eraseFromParent();
1919           CurRec = New;
1920         } else
1921           RecWithFlags->dropPoisonGeneratingFlags();
1922       } else {
1923         Instruction *Instr = dyn_cast_or_null<Instruction>(
1924             CurRec->getVPSingleValue()->getUnderlyingValue());
1925         (void)Instr;
1926         assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
1927                "found instruction with poison generating flags not covered by "
1928                "VPRecipeWithIRFlags");
1929       }
1930 
1931       // Add new definitions to the worklist.
1932       for (VPValue *Operand : CurRec->operands())
1933         if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
1934           Worklist.push_back(OpDef);
1935     }
1936   });
1937 
1938   // Traverse all the recipes in the VPlan and collect the poison-generating
1939   // recipes in the backward slice starting at the address of a
1940   // VPWidenMemoryRecipe or VPInterleaveRecipe.
1941   auto Iter = vp_depth_first_deep(Plan.getEntry());
1942   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1943     for (VPRecipeBase &Recipe : *VPBB) {
1944       if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
1945         Instruction &UnderlyingInstr = WidenRec->getIngredient();
1946         VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
1947         if (AddrDef && WidenRec->isConsecutive() &&
1948             BlockNeedsPredication(UnderlyingInstr.getParent()))
1949           CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1950       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1951         VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
1952         if (AddrDef) {
1953           // Check if any member of the interleave group needs predication.
1954           const InterleaveGroup<Instruction> *InterGroup =
1955               InterleaveRec->getInterleaveGroup();
1956           bool NeedPredication = false;
1957           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1958                I < NumMembers; ++I) {
1959             Instruction *Member = InterGroup->getMember(I);
1960             if (Member)
1961               NeedPredication |= BlockNeedsPredication(Member->getParent());
1962           }
1963 
1964           if (NeedPredication)
1965             CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1966         }
1967       }
1968     }
1969   }
1970 }
1971 
1972 void VPlanTransforms::createInterleaveGroups(
1973     VPlan &Plan,
1974     const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
1975         &InterleaveGroups,
1976     VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
1977   if (InterleaveGroups.empty())
1978     return;
1979 
1980   // Interleave memory: for each Interleave Group we marked earlier as relevant
1981   // for this VPlan, replace the Recipes widening its memory instructions with a
1982   // single VPInterleaveRecipe at its insertion point.
1983   VPDominatorTree VPDT;
1984   VPDT.recalculate(Plan);
1985   for (const auto *IG : InterleaveGroups) {
1986     SmallVector<VPValue *, 4> StoredValues;
1987     for (unsigned i = 0; i < IG->getFactor(); ++i)
1988       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
1989         auto *StoreR = cast<VPWidenStoreRecipe>(RecipeBuilder.getRecipe(SI));
1990         StoredValues.push_back(StoreR->getStoredValue());
1991       }
1992 
1993     bool NeedsMaskForGaps =
1994         IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed;
1995 
1996     Instruction *IRInsertPos = IG->getInsertPos();
1997     auto *InsertPos =
1998         cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
1999 
2000     // Get or create the start address for the interleave group.
2001     auto *Start =
2002         cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
2003     VPValue *Addr = Start->getAddr();
2004     VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
2005     if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
2006       // TODO: Hoist Addr's defining recipe (and any operands as needed) to
2007       // InsertPos or sink loads above zero members to join it.
2008       bool InBounds = false;
2009       if (auto *Gep = dyn_cast<GetElementPtrInst>(
2010               getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
2011         InBounds = Gep->isInBounds();
2012 
2013       // We cannot re-use the address of member zero because it does not
2014       // dominate the insert position. Instead, use the address of the insert
2015       // position and create a PtrAdd adjusting it to the address of member
2016       // zero.
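      // E.g. (illustrative), for an interleave group of i32 members whose
      // insert position is member 2, the group's start address is
      //   %addr = ptradd %addr.insertpos, ir<-8>
      // i.e. the insert position's address minus 2 * 4 bytes.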
2017       assert(IG->getIndex(IRInsertPos) != 0 &&
2018              "index of insert position shouldn't be zero");
2019       auto &DL = IRInsertPos->getDataLayout();
2020       APInt Offset(32,
2021                    DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
2022                        IG->getIndex(IRInsertPos),
2023                    /*IsSigned=*/true);
2024       VPValue *OffsetVPV = Plan.getOrAddLiveIn(
2025           ConstantInt::get(IRInsertPos->getParent()->getContext(), -Offset));
2026       VPBuilder B(InsertPos);
2027       Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV)
2028                       : B.createPtrAdd(InsertPos->getAddr(), OffsetVPV);
2029     }
2030     auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
2031                                         InsertPos->getMask(), NeedsMaskForGaps);
2032     VPIG->insertBefore(InsertPos);
2033 
2034     unsigned J = 0;
2035     for (unsigned i = 0; i < IG->getFactor(); ++i)
2036       if (Instruction *Member = IG->getMember(i)) {
2037         VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
2038         if (!Member->getType()->isVoidTy()) {
2039           VPValue *OriginalV = MemberR->getVPSingleValue();
2040           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
2041           J++;
2042         }
2043         MemberR->eraseFromParent();
2044       }
2045   }
2046 }
2047 
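/// Lower the abstract canonical-IV and EVL-based-IV header phis to concrete
/// scalar phis, e.g. conceptually
///   EMIT %index = CANONICAL-INDUCTION ir<0>, %index.next
/// becomes
///   EMIT %index = SCALAR-PHI ir<0>, %index.next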
2048 void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
2049   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2050            vp_depth_first_deep(Plan.getEntry()))) {
2051     for (VPRecipeBase &R : make_early_inc_range(VPBB->phis())) {
2052       if (!isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(&R))
2053         continue;
2054       auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
2055       StringRef Name =
2056           isa<VPCanonicalIVPHIRecipe>(PhiR) ? "index" : "evl.based.iv";
2057       auto *ScalarR =
2058           new VPScalarPHIRecipe(PhiR->getStartValue(), PhiR->getBackedgeValue(),
2059                                 PhiR->getDebugLoc(), Name);
2060       ScalarR->insertBefore(PhiR);
2061       PhiR->replaceAllUsesWith(ScalarR);
2062       PhiR->eraseFromParent();
2063     }
2064   }
2065 }
2066 
2067 bool VPlanTransforms::handleUncountableEarlyExit(
2068     VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop,
2069     BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder) {
2070   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2071   auto *LatchVPBB = cast<VPBasicBlock>(LoopRegion->getExiting());
2072   VPBuilder Builder(LatchVPBB->getTerminator());
2073   auto *MiddleVPBB = Plan.getMiddleBlock();
2074   VPValue *IsEarlyExitTaken = nullptr;
2075 
2076   // Process the uncountable exiting block. Update IsEarlyExitTaken, which
2077   // tracks if the uncountable early exit has been taken. Also split the middle
2078   // block and have it conditionally branch to the early exit block if
2079   // IsEarlyExitTaken.
2080   auto *EarlyExitingBranch =
2081       cast<BranchInst>(UncountableExitingBlock->getTerminator());
2082   BasicBlock *TrueSucc = EarlyExitingBranch->getSuccessor(0);
2083   BasicBlock *FalseSucc = EarlyExitingBranch->getSuccessor(1);
2084 
2085   // The early exit block may or may not be the same as the "countable" exit
2086   // block. Create a new VPIRBB for the early exit block if it is distinct
2087   // from the countable exit block.
2088   // TODO: Introduce both exit blocks during VPlan skeleton construction.
2089   VPIRBasicBlock *VPEarlyExitBlock;
2090   if (OrigLoop->getUniqueExitBlock()) {
2091     VPEarlyExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
2092   } else {
2093     VPEarlyExitBlock = Plan.createVPIRBasicBlock(
2094         !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
2095   }
2096 
2097   VPValue *EarlyExitNotTakenCond = RecipeBuilder.getBlockInMask(
2098       OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
2099   auto *EarlyExitTakenCond = Builder.createNot(EarlyExitNotTakenCond);
2100   IsEarlyExitTaken =
2101       Builder.createNaryOp(VPInstruction::AnyOf, {EarlyExitTakenCond});
2102 
2103   VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
2104   VPBlockUtils::insertOnEdge(LoopRegion, MiddleVPBB, NewMiddle);
2105   VPBlockUtils::connectBlocks(NewMiddle, VPEarlyExitBlock);
2106   NewMiddle->swapSuccessors();
2107 
2108   // Update the exit phis in the early exit block.
2109   VPBuilder MiddleBuilder(NewMiddle);
2110   for (VPRecipeBase &R : *VPEarlyExitBlock) {
2111     auto *ExitIRI = cast<VPIRInstruction>(&R);
2112     auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
2113     if (!ExitPhi)
2114       break;
2115 
2116     VPValue *IncomingFromEarlyExit = RecipeBuilder.getVPValueOrAddLiveIn(
2117         ExitPhi->getIncomingValueForBlock(UncountableExitingBlock));
2118     // The incoming value from the early exit must be a live-in for now.
2119     if (!IncomingFromEarlyExit->isLiveIn())
2120       return false;
2121 
2122     if (OrigLoop->getUniqueExitBlock()) {
2123       // If there's a unique exit block, VPEarlyExitBlock has 2 predecessors
2124       // (MiddleVPBB and NewMiddle). Add the incoming value from MiddleVPBB
2125       // which is coming from the original latch.
2126       VPValue *IncomingFromLatch = RecipeBuilder.getVPValueOrAddLiveIn(
2127           ExitPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
2128       ExitIRI->addOperand(IncomingFromLatch);
2129       ExitIRI->extractLastLaneOfOperand(MiddleBuilder);
2130     }
2131     // Add the incoming value from the early exit.
2132     ExitIRI->addOperand(IncomingFromEarlyExit);
2133   }
2134   MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
2135 
2136   // Replace the condition controlling the non-early exit from the vector loop
2137   // with one exiting if either the original condition of the vector latch is
2138   // true or the early exit has been taken.
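  // Conceptually (illustrative):
  //   EMIT branch-on-count %iv.next, %vec.tc
  // becomes
  //   EMIT %latch.exit = icmp eq %iv.next, %vec.tc
  //   EMIT %any.exit = or %early.exit.taken, %latch.exit
  //   EMIT branch-on-cond %any.exit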
2139   auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
2140   assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
2141          "Unexpected terminator");
2142   auto *IsLatchExitTaken =
2143       Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
2144                          LatchExitingBranch->getOperand(1));
2145   auto *AnyExitTaken = Builder.createNaryOp(
2146       Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
2147   Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
2148   LatchExitingBranch->eraseFromParent();
2149   return true;
2150 }
2151