//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements explicit unrolling for VPlans.
///
//===----------------------------------------------------------------------===//

#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

namespace {

/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
/// the unrolling transformation, where the original VPValues are retained for
/// part zero.
class UnrollState {
  /// Plan to unroll.
  VPlan &Plan;
  /// Unroll factor to unroll by.
  const unsigned UF;
  /// Analysis for types.
  VPTypeAnalysis TypeInfo;

  /// Unrolling may create recipes that should not be unrolled themselves.
  /// Those are tracked in ToSkip.
  SmallPtrSet<VPRecipeBase *, 8> ToSkip;

  // Associate with each VPValue of part 0 its unrolled instances of parts 1,
  // ..., UF-1.
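  // For example, with UF = 4 a value %v defined by a part-0 recipe maps to its
  // clones {%v.1, %v.2, %v.3}; getValueForPart(%v, I) below returns %v for
  // I == 0 and the clone for part I otherwise. (Value names are illustrative.)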
  DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;

  /// Unroll replicate region \p VPR by cloning the region UF - 1 times.
  void unrollReplicateRegionByUF(VPRegionBlock *VPR);

  /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform across
  /// all parts.
  void unrollRecipeByUF(VPRecipeBase &R);

  /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled
  /// depends on the concrete header phi. Inserts newly created recipes at \p
  /// InsertPtForPhi.
  void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                           VPBasicBlock::iterator InsertPtForPhi);

  /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
  /// the induction steps for each part.
  void unrollWidenInductionByUF(VPWidenIntOrFpInductionRecipe *IV,
                                VPBasicBlock::iterator InsertPtForPhi);

  VPValue *getConstantVPV(unsigned Part) {
    Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
    return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
  }

public:
  UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
      : Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {}

  void unrollBlock(VPBlockBase *VPB);

  VPValue *getValueForPart(VPValue *V, unsigned Part) {
    if (Part == 0 || V->isLiveIn())
      return V;
    assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
           "accessed value does not exist");
    return VPV2Parts[V][Part - 1];
  }

  /// Given a single original recipe \p OrigR (of part zero), and its copy \p
  /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
  /// corresponding VPValue defined by \p CopyR.
  void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
                        unsigned Part) {
    for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
      auto Ins = VPV2Parts.insert({VPV, {}});
      assert(Ins.first->second.size() == Part - 1 && "earlier parts not set");
      Ins.first->second.push_back(CopyR->getVPValue(Idx));
    }
  }

  /// Given a uniform recipe \p R, add it for all parts.
  void addUniformForAllParts(VPSingleDefRecipe *R) {
    auto Ins = VPV2Parts.insert({R, {}});
    assert(Ins.second && "uniform value already added");
    for (unsigned Part = 0; Part != UF; ++Part)
      Ins.first->second.push_back(R);
  }

  bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }
  /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
  /// \p Part.
  void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
    auto *Op = R->getOperand(OpIdx);
    R->setOperand(OpIdx, getValueForPart(Op, Part));
  }

  /// Update \p R's operands with their corresponding VPValues for part \p Part.
  void remapOperands(VPRecipeBase *R, unsigned Part) {
    for (const auto &[OpIdx, Op] : enumerate(R->operands()))
      R->setOperand(OpIdx, getValueForPart(Op, Part));
  }
};
} // namespace

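// For example (illustrative, UF = 2), a replicate region R with a single
// successor S,
//   [R] -> [S]
// is unrolled to
//   [R] -> [R.1] -> [S]
// where R.1 is a clone of R computing part 1, with its operands remapped to
// the part-1 values recorded for R's recipes.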
void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
  VPBlockBase *InsertPt = VPR->getSingleSuccessor();
  for (unsigned Part = 1; Part != UF; ++Part) {
    auto *Copy = VPR->clone();
    VPBlockUtils::insertBlockBefore(Copy, InsertPt);

    auto PartI = vp_depth_first_shallow(Copy->getEntry());
    auto Part0 = vp_depth_first_shallow(VPR->getEntry());
    for (const auto &[PartIVPBB, Part0VPBB] :
         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(PartI),
             VPBlockUtils::blocksOnly<VPBasicBlock>(Part0))) {
      for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
        remapOperands(&PartIR, Part);
        if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
          ScalarIVSteps->addOperand(getConstantVPV(Part));
        }

        addRecipeForPart(&Part0R, &PartIR, Part);
      }
    }
  }
}

void UnrollState::unrollWidenInductionByUF(
    VPWidenIntOrFpInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
  VPBasicBlock *PH = cast<VPBasicBlock>(
      IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
  Type *IVTy = TypeInfo.inferScalarType(IV);
  auto &ID = IV->getInductionDescriptor();
  std::optional<FastMathFlags> FMFs;
  if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
    FMFs = ID.getInductionBinOp()->getFastMathFlags();

  VPValue *VectorStep = &Plan.getVF();
  VPBuilder Builder(PH);
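  // VF is an integer quantity; bring it to the IV's type, converting for
  // floating-point inductions and truncating for narrower integer IVs.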
  if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
    Instruction::CastOps CastOp =
        IVTy->isFloatingPointTy() ? Instruction::UIToFP : Instruction::Trunc;
    VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
    ToSkip.insert(VectorStep->getDefiningRecipe());
  }

  VPValue *ScalarStep = IV->getStepValue();
  auto *ConstStep = ScalarStep->isLiveIn()
                        ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
                        : nullptr;
  if (!ConstStep || ConstStep->getValue() != 1) {
    if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
      ScalarStep =
          Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
      ToSkip.insert(ScalarStep->getDefiningRecipe());
    }

    unsigned MulOpc =
        IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
    VPInstruction *Mul = Builder.createNaryOp(MulOpc, {VectorStep, ScalarStep},
                                              FMFs, IV->getDebugLoc());
    VectorStep = Mul;
    ToSkip.insert(Mul);
  }
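  // VectorStep now holds the per-part stride: VF * ScalarStep, or simply VF
  // when the step is the constant 1.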

  // Now create recipes to compute the induction steps for parts 1 .. UF - 1.
  // Part 0 remains the header phi. Parts > 0 are computed by adding VectorStep
  // to the previous part. The header phi recipe will get 2 new operands: the
  // step value for a single part and the last part, used to compute the
  // backedge value during VPWidenIntOrFpInductionRecipe::execute:
  //   %Part.0 = VPWidenIntOrFpInductionRecipe %Start, %ScalarStep, %VectorStep,
  //             %Part.3
  //   %Part.1 = %Part.0 + %VectorStep
  //   %Part.2 = %Part.1 + %VectorStep
  //   %Part.3 = %Part.2 + %VectorStep
  //
  // The newly added recipes are added to ToSkip to avoid unrolling them again.
  VPValue *Prev = IV;
  Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
  unsigned AddOpc =
      IVTy->isFloatingPointTy() ? ID.getInductionOpcode() : Instruction::Add;
  for (unsigned Part = 1; Part != UF; ++Part) {
    std::string Name =
        Part > 1 ? "step.add." + std::to_string(Part) : "step.add";

    VPInstruction *Add = Builder.createNaryOp(AddOpc, {Prev, VectorStep}, FMFs,
                                              IV->getDebugLoc(), Name);
    ToSkip.insert(Add);
    addRecipeForPart(IV, Add, Part);
    Prev = Add;
  }
  IV->addOperand(VectorStep);
  IV->addOperand(Prev);
}

void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                                      VPBasicBlock::iterator InsertPtForPhi) {
  // First-order recurrences pass a single vector or scalar through their header
  // phis, irrespective of interleaving.
  if (isa<VPFirstOrderRecurrencePHIRecipe>(R))
    return;

  // Generate step vectors for each unrolled part.
  if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(R)) {
    unrollWidenInductionByUF(IV, InsertPtForPhi);
    return;
  }

  auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(R);
  if (RdxPhi && RdxPhi->isOrdered())
    return;

  auto InsertPt = std::next(R->getIterator());
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R->clone();
    Copy->insertBefore(*R->getParent(), InsertPt);
    addRecipeForPart(R, Copy, Part);
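    // Record which part the clone belongs to; pointer inductions additionally
    // need the part-0 phi to offset from.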
    if (isa<VPWidenPointerInductionRecipe>(R)) {
      Copy->addOperand(R);
      Copy->addOperand(getConstantVPV(Part));
    } else if (RdxPhi) {
      Copy->addOperand(getConstantVPV(Part));
    } else {
      assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
             "unexpected header phi recipe not needing unrolled part");
    }
  }
}

/// Handle non-header-phi recipes.
void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
  using namespace llvm::VPlanPatternMatch;
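  // The loop region's terminating branch is shared by all parts; there is
  // nothing to clone for it.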
  if (match(&R, m_BranchOnCond(m_VPValue())) ||
      match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
    return;

  if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
    if (vputils::onlyFirstPartUsed(VPI)) {
      addUniformForAllParts(VPI);
      return;
    }
  }
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
    if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
        RepR->getOperand(1)->isDefinedOutsideLoopRegions()) {
      // Stores to an invariant address only need to store the last part.
      remapOperands(&R, UF - 1);
      return;
    }
    if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingValue())) {
      if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) {
        addUniformForAllParts(RepR);
        return;
      }
    }
  }

  // Unroll non-uniform recipes.
  auto InsertPt = std::next(R.getIterator());
  VPBasicBlock &VPBB = *R.getParent();
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R.clone();
    Copy->insertBefore(VPBB, InsertPt);
    addRecipeForPart(&R, Copy, Part);

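    // A first-order recurrence splice for part I combines the recurrence value
    // from part I - 1 with the value of part I, so remap its operands to
    // adjacent parts instead of remapping them uniformly.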
    VPValue *Op;
    if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
                      m_VPValue(), m_VPValue(Op)))) {
      Copy->setOperand(0, getValueForPart(Op, Part - 1));
      Copy->setOperand(1, getValueForPart(Op, Part));
      continue;
    }
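    // Ordered reductions form a chain across parts: each part's reduction
    // starts from the previous part's result, and the phi's backedge operand
    // is updated to the result of the last part.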
    if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
      auto *Phi = cast<VPReductionPHIRecipe>(R.getOperand(0));
      if (Phi->isOrdered()) {
        auto &Parts = VPV2Parts[Phi];
        if (Part == 1) {
          Parts.clear();
          Parts.push_back(Red);
        }
        Parts.push_back(Copy->getVPSingleValue());
        Phi->setOperand(1, Copy->getVPSingleValue());
      }
    }
    remapOperands(Copy, Part);

    // Add an operand indicating the part to generate code for, to recipes that
    // still require it.
    if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,
            VPVectorPointerRecipe, VPReverseVectorPointerRecipe>(Copy) ||
        match(Copy, m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
                        m_VPValue())))
      Copy->addOperand(getConstantVPV(Part));

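    // Vector pointer recipes compute their per-part address from the part-0
    // base pointer plus the part operand added above, so restore their base
    // operand rather than remapping it.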
    if (isa<VPVectorPointerRecipe, VPReverseVectorPointerRecipe>(R))
      Copy->setOperand(0, R.getOperand(0));
  }
}

using namespace llvm::VPlanPatternMatch;
void UnrollState::unrollBlock(VPBlockBase *VPB) {
  auto *VPR = dyn_cast<VPRegionBlock>(VPB);
  if (VPR) {
    if (VPR->isReplicator())
      return unrollReplicateRegionByUF(VPR);

    // Traverse blocks in region in RPO to ensure defs are visited before uses
    // across blocks.
    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
        RPOT(VPR->getEntry());
    for (VPBlockBase *VPB : RPOT)
      unrollBlock(VPB);
    return;
  }

  // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
  auto *VPBB = cast<VPBasicBlock>(VPB);
  auto InsertPtForPhi = VPBB->getFirstNonPhi();
  for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
    if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
      continue;

    // Add all VPValues for all parts to ComputeReductionResult, which combines
    // the parts to compute the final reduction value.
    VPValue *Op1;
    if (match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
                      m_VPValue(), m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPInstruction>(&R));
      for (unsigned Part = 1; Part != UF; ++Part)
        R.addOperand(getValueForPart(Op1, Part));
      continue;
    }
    VPValue *Op0;
    if (match(&R, m_VPInstruction<VPInstruction::ExtractFromEnd>(
                      m_VPValue(Op0), m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
      if (Plan.hasScalarVFOnly()) {
        // Extracting from the end with VF = 1 implies retrieving the scalar
        // part UF - Op1.
        unsigned Offset =
            cast<ConstantInt>(Op1->getLiveInIRValue())->getZExtValue();
        R.getVPSingleValue()->replaceAllUsesWith(
            getValueForPart(Op0, UF - Offset));
        R.eraseFromParent();
      } else {
        // Otherwise we extract from the last part.
        remapOperands(&R, UF - 1);
      }
      continue;
    }

    auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
    if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
      addUniformForAllParts(SingleDef);
      continue;
    }

    if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
      unrollHeaderPHIByUF(H, InsertPtForPhi);
      continue;
    }

    unrollRecipeByUF(R);
  }
}

void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
  assert(UF > 0 && "Unroll factor must be positive");
  Plan.setUF(UF);
  auto Cleanup = make_scope_exit([&Plan]() {
    auto Iter = vp_depth_first_deep(Plan.getEntry());
    // Remove recipes that are redundant after unrolling.
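    // A CanonicalIVIncrementForPart that never received a part operand belongs
    // to part 0, where the increment is a no-op; fold it into its input.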
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
      for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
        auto *VPI = dyn_cast<VPInstruction>(&R);
        if (VPI &&
            VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
            VPI->getNumOperands() == 1) {
          VPI->replaceAllUsesWith(VPI->getOperand(0));
          VPI->eraseFromParent();
        }
      }
    }
  });
  if (UF == 1) {
    return;
  }

  UnrollState Unroller(Plan, UF, Ctx);

  // Iterate over all blocks in the plan starting from Entry, and unroll
  // recipes inside them. This includes the vector preheader and middle blocks,
  // which may set up or post-process per-part values.
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  for (VPBlockBase *VPB : RPOT)
    Unroller.unrollBlock(VPB);

  unsigned Part = 1;
  // Remap operands of cloned header phis to update backedge values. The header
  // phis cloned during unrolling are placed just after the part-0 header phi.
  // Reset Part to 1 whenever a part-0 phi is reached.
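  // E.g., with UF = 2 the header phis appear as
  //   %phi.a, %phi.a.1, %phi.b, %phi.b.1, ...
  // so Part counts up from 1 and resets at each part-0 phi.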
  for (VPRecipeBase &H :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    // The second operand of fixed-order recurrence phis, feeding the spliced
    // value across the backedge, needs to remap to the last part of the
    // spliced value.
    if (isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {
      Unroller.remapOperand(&H, 1, UF - 1);
      continue;
    }
    if (Unroller.contains(H.getVPSingleValue()) ||
        isa<VPWidenPointerInductionRecipe>(&H)) {
      Part = 1;
      continue;
    }
    Unroller.remapOperands(&H, Part);
    Part++;
  }

  VPlanTransforms::removeDeadRecipes(Plan);
}
447