//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements explicit unrolling for VPlans.
///
//===----------------------------------------------------------------------===//

#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;

namespace {

/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
/// the unrolling transformation, where the original VPValues are retained for
/// part zero.
class UnrollState {
  /// Plan to unroll.
  VPlan &Plan;
  /// Unroll factor to unroll by.
  const unsigned UF;
  /// Analysis for types.
  VPTypeAnalysis TypeInfo;

  /// Unrolling may create recipes that should not be unrolled themselves.
  /// Those are tracked in ToSkip.
  SmallPtrSet<VPRecipeBase *, 8> ToSkip;

  /// Associate with each VPValue of part 0 its unrolled instances of parts 1,
  /// ..., UF-1.
  DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;

  /// Unroll replicate region \p VPR by cloning the region UF - 1 times.
  void unrollReplicateRegionByUF(VPRegionBlock *VPR);

  /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform
  /// across all parts.
  void unrollRecipeByUF(VPRecipeBase &R);

  /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled
  /// depends on the concrete header phi. Inserts newly created recipes at
  /// \p InsertPtForPhi.
  void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                           VPBasicBlock::iterator InsertPtForPhi);

  /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
  /// the induction steps for each part.
  void unrollWidenInductionByUF(VPWidenIntOrFpInductionRecipe *IV,
                                VPBasicBlock::iterator InsertPtForPhi);

  VPValue *getConstantVPV(unsigned Part) {
    Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
    return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
  }

public:
  UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
      : Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {}

  void unrollBlock(VPBlockBase *VPB);

  VPValue *getValueForPart(VPValue *V, unsigned Part) {
    if (Part == 0 || V->isLiveIn())
      return V;
    assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
           "accessed value does not exist");
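    // Parts 1, ..., UF-1 are stored at indices 0, ..., UF-2.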
    return VPV2Parts[V][Part - 1];
  }

  /// Given a single original recipe \p OrigR (of part zero), and its copy \p
  /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
  /// corresponding VPValue defined by \p CopyR.
  void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
                        unsigned Part) {
    for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
      auto Ins = VPV2Parts.insert({VPV, {}});
      assert(Ins.first->second.size() == Part - 1 && "earlier parts not set");
      Ins.first->second.push_back(CopyR->getVPValue(Idx));
    }
  }

  /// Given a uniform recipe \p R, add it for all parts.
  void addUniformForAllParts(VPSingleDefRecipe *R) {
    auto Ins = VPV2Parts.insert({R, {}});
    assert(Ins.second && "uniform value already added");
    for (unsigned Part = 0; Part != UF; ++Part)
      Ins.first->second.push_back(R);
  }

  bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }

  /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
  /// \p Part.
  void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
    auto *Op = R->getOperand(OpIdx);
    R->setOperand(OpIdx, getValueForPart(Op, Part));
  }

  /// Update \p R's operands with their corresponding VPValues for part
  /// \p Part.
  void remapOperands(VPRecipeBase *R, unsigned Part) {
    for (const auto &[OpIdx, Op] : enumerate(R->operands()))
      R->setOperand(OpIdx, getValueForPart(Op, Part));
  }
};
} // namespace

void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
  VPBlockBase *InsertPt = VPR->getSingleSuccessor();
  for (unsigned Part = 1; Part != UF; ++Part) {
    auto *Copy = VPR->clone();
    VPBlockUtils::insertBlockBefore(Copy, InsertPt);

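    // Walk the cloned region and the original region in lockstep, remapping
    // each cloned recipe's operands to the values for the current part.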
    auto PartI = vp_depth_first_shallow(Copy->getEntry());
    auto Part0 = vp_depth_first_shallow(VPR->getEntry());
    for (const auto &[PartIVPBB, Part0VPBB] :
         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(PartI),
             VPBlockUtils::blocksOnly<VPBasicBlock>(Part0))) {
      for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
        remapOperands(&PartIR, Part);
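        // Scalar IV steps recipes need the part number as an extra operand to
        // compute the steps for that part.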
        if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
          ScalarIVSteps->addOperand(getConstantVPV(Part));
        }

        addRecipeForPart(&Part0R, &PartIR, Part);
      }
    }
  }
}

void UnrollState::unrollWidenInductionByUF(
    VPWidenIntOrFpInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
  VPBasicBlock *PH = cast<VPBasicBlock>(
      IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
  Type *IVTy = TypeInfo.inferScalarType(IV);
  auto &ID = IV->getInductionDescriptor();
  std::optional<FastMathFlags> FMFs;
  if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
    FMFs = ID.getInductionBinOp()->getFastMathFlags();

  VPValue *VectorStep = &Plan.getVF();
  VPBuilder Builder(PH);
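  // The per-part step is VF, possibly scaled by the IV's scalar step below. VF
  // is an integer; convert it to the IV's type if needed: uitofp for
  // floating-point inductions, trunc for narrower integer IVs.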
  if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
    Instruction::CastOps CastOp =
        IVTy->isFloatingPointTy() ? Instruction::UIToFP : Instruction::Trunc;
    VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
    ToSkip.insert(VectorStep->getDefiningRecipe());
  }

  VPValue *ScalarStep = IV->getStepValue();
  auto *ConstStep = ScalarStep->isLiveIn()
                        ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
                        : nullptr;
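  // Scale VectorStep by the IV's scalar step, unless the step is known to be
  // constant 1, in which case the multiply is redundant.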
  if (!ConstStep || ConstStep->getZExtValue() != 1) {
    if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
      ScalarStep =
          Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
      ToSkip.insert(ScalarStep->getDefiningRecipe());
    }

    unsigned MulOpc =
        IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
    VPInstruction *Mul = Builder.createNaryOp(MulOpc, {VectorStep, ScalarStep},
                                              FMFs, IV->getDebugLoc());
    VectorStep = Mul;
    ToSkip.insert(Mul);
  }

  // Now create recipes to compute the induction steps for parts 1 .. UF-1.
  // Part 0 remains the header phi. Parts > 0 are computed by adding VectorStep
  // to the previous part. The header phi recipe will get 2 new operands: the
  // step value for a single part and the last part, used to compute the
  // backedge value during VPWidenIntOrFpInductionRecipe::execute.
  //   %Part.0 = VPWidenIntOrFpInductionRecipe %Start, %ScalarStep,
  //             %VectorStep, %Part.3
  //   %Part.1 = %Part.0 + %VectorStep
  //   %Part.2 = %Part.1 + %VectorStep
  //   %Part.3 = %Part.2 + %VectorStep
  //
  // The newly added recipes are added to ToSkip to avoid unrolling them again.
  VPValue *Prev = IV;
  Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
  unsigned AddOpc =
      IVTy->isFloatingPointTy() ? ID.getInductionOpcode() : Instruction::Add;
  for (unsigned Part = 1; Part != UF; ++Part) {
    std::string Name =
        Part > 1 ? "step.add." + std::to_string(Part) : "step.add";

    VPInstruction *Add = Builder.createNaryOp(AddOpc,
                                              {
                                                  Prev,
                                                  VectorStep,
                                              },
                                              FMFs, IV->getDebugLoc(), Name);
    ToSkip.insert(Add);
    addRecipeForPart(IV, Add, Part);
    Prev = Add;
  }
  IV->addOperand(VectorStep);
  IV->addOperand(Prev);
}

void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                                      VPBasicBlock::iterator InsertPtForPhi) {
  // First-order recurrences pass a single vector or scalar through their
  // header phis, irrespective of interleaving.
  if (isa<VPFirstOrderRecurrencePHIRecipe>(R))
    return;

  // Generate step vectors for each unrolled part.
  if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(R)) {
    unrollWidenInductionByUF(IV, InsertPtForPhi);
    return;
  }

  auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(R);
  if (RdxPhi && RdxPhi->isOrdered())
    return;

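  // Clone the phi for parts 1 .. UF-1. Pointer-induction clones get the part-0
  // phi and the part number as extra operands; reduction phi clones get the
  // part number.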
  auto InsertPt = std::next(R->getIterator());
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R->clone();
    Copy->insertBefore(*R->getParent(), InsertPt);
    addRecipeForPart(R, Copy, Part);
    if (isa<VPWidenPointerInductionRecipe>(R)) {
      Copy->addOperand(R);
      Copy->addOperand(getConstantVPV(Part));
    } else if (RdxPhi) {
      Copy->addOperand(getConstantVPV(Part));
    } else {
      assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
             "unexpected header phi recipe not needing unrolled part");
    }
  }
}

/// Handle non-header-phi recipes.
void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
  using namespace llvm::VPlanPatternMatch;
  if (match(&R, m_BranchOnCond(m_VPValue())) ||
      match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
    return;

  if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
    VPValue *Op0, *Op1;
    if (match(VPI, m_VPInstruction<VPInstruction::ExtractFromEnd>(
                       m_VPValue(Op0), m_VPValue(Op1)))) {
      VPI->setOperand(1, getValueForPart(Op1, UF - 1));
      addUniformForAllParts(VPI);
      if (Plan.hasScalarVFOnly()) {
        // Extracting from end with VF = 1 implies retrieving the scalar part
        // UF - Op1. E.g., with UF = 4, Op1 = 1 selects the part-3 value.
        unsigned Offset =
            cast<ConstantInt>(Op1->getLiveInIRValue())->getZExtValue();
        VPI->replaceAllUsesWith(getValueForPart(Op0, UF - Offset));
      } else {
        // Otherwise we extract from the last part.
        remapOperands(VPI, UF - 1);
      }
      return;
    }

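    // If only the first part of VPI is used, all parts can share the part-0
    // value; no clones are needed.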
    if (vputils::onlyFirstPartUsed(VPI)) {
      addUniformForAllParts(VPI);
      return;
    }
  }
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
    if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
        RepR->getOperand(1)->isDefinedOutsideLoopRegions()) {
      // Stores to an invariant address only need to store the last part.
      remapOperands(&R, UF - 1);
      return;
    }
    if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingValue())) {
      if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) {
        addUniformForAllParts(RepR);
        return;
      }
    }
  }

  // Unroll non-uniform recipes.
  auto InsertPt = std::next(R.getIterator());
  VPBasicBlock &VPBB = *R.getParent();
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R.clone();
    Copy->insertBefore(VPBB, InsertPt);
    addRecipeForPart(&R, Copy, Part);

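    // A first-order recurrence splice for part Part combines the recurrence
    // value of part Part - 1 with the current value of part Part.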
    VPValue *Op;
    if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
                      m_VPValue(), m_VPValue(Op)))) {
      Copy->setOperand(0, getValueForPart(Op, Part - 1));
      Copy->setOperand(1, getValueForPart(Op, Part));
      continue;
    }
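    // Ordered reductions form a chain: each part's reduction starts from the
    // previous part's result, and the phi's backedge operand is updated to the
    // last link in the chain.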
    if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
      auto *Phi = cast<VPReductionPHIRecipe>(R.getOperand(0));
      if (Phi->isOrdered()) {
        auto Ins = VPV2Parts.insert({Phi, {}});
        if (Part == 1) {
          Ins.first->second.clear();
          Ins.first->second.push_back(Red);
        }
        Ins.first->second.push_back(Copy->getVPSingleValue());
        Phi->setOperand(1, Copy->getVPSingleValue());
      }
    }
    remapOperands(Copy, Part);

    // Add an operand indicating the part to generate code for, to recipes
    // still requiring it.
    if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,
            VPVectorPointerRecipe>(Copy) ||
        match(Copy, m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
                        m_VPValue())))
      Copy->addOperand(getConstantVPV(Part));

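    // Vector pointer recipes compute their per-part address from the part
    // operand added above, so every part keeps the original base pointer.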
    if (isa<VPVectorPointerRecipe>(R))
      Copy->setOperand(0, R.getOperand(0));
  }
}

using namespace llvm::VPlanPatternMatch;
void UnrollState::unrollBlock(VPBlockBase *VPB) {
  auto *VPR = dyn_cast<VPRegionBlock>(VPB);
  if (VPR) {
    if (VPR->isReplicator())
      return unrollReplicateRegionByUF(VPR);

    // Traverse blocks in region in RPO to ensure defs are visited before uses
    // across blocks.
    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
        RPOT(VPR->getEntry());
    for (VPBlockBase *VPB : RPOT)
      unrollBlock(VPB);
    return;
  }

  // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
  auto *VPBB = cast<VPBasicBlock>(VPB);
  auto InsertPtForPhi = VPBB->getFirstNonPhi();
  for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
    if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
      continue;

    // Add all VPValues for all parts to ComputeReductionResult which combines
    // the parts to compute the final reduction value.
    VPValue *Op1;
    if (match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
                      m_VPValue(), m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPInstruction>(&R));
      for (unsigned Part = 1; Part != UF; ++Part)
        R.addOperand(getValueForPart(Op1, Part));
      continue;
    }
    VPValue *Op0;
    if (match(&R, m_VPInstruction<VPInstruction::ExtractFromEnd>(
                      m_VPValue(Op0), m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
      if (Plan.hasScalarVFOnly()) {
        // Extracting from end with VF = 1 implies retrieving the scalar part
        // UF - Op1.
        unsigned Offset =
            cast<ConstantInt>(Op1->getLiveInIRValue())->getZExtValue();
        R.getVPSingleValue()->replaceAllUsesWith(
            getValueForPart(Op0, UF - Offset));
        R.eraseFromParent();
      } else {
        // Otherwise we extract from the last part.
        remapOperands(&R, UF - 1);
      }
      continue;
    }

    auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
    if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
      addUniformForAllParts(SingleDef);
      continue;
    }

    if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
      unrollHeaderPHIByUF(H, InsertPtForPhi);
      continue;
    }

    unrollRecipeByUF(R);
  }
}

void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
  assert(UF > 0 && "Unroll factor must be positive");
  Plan.setUF(UF);
  auto Cleanup = make_scope_exit([&Plan]() {
    auto Iter = vp_depth_first_deep(Plan.getEntry());
    // Remove recipes that are redundant after unrolling.
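    // A CanonicalIVIncrementForPart that never received a part operand
    // computes part 0, which is just its input IV.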
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
      for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
        auto *VPI = dyn_cast<VPInstruction>(&R);
        if (VPI &&
            VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
            VPI->getNumOperands() == 1) {
          VPI->replaceAllUsesWith(VPI->getOperand(0));
          VPI->eraseFromParent();
        }
      }
    }
  });
  if (UF == 1)
    return;

  UnrollState Unroller(Plan, UF, Ctx);

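  // Unroll recipes in the plan's preheader first; it is not covered by the
  // entry-based traversal below.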
  Unroller.unrollBlock(Plan.getPreheader());

  // Iterate over all blocks in the plan starting from Entry, and unroll
  // recipes inside them. This includes the vector preheader and middle blocks,
  // which may set up or post-process per-part values.
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  for (VPBlockBase *VPB : RPOT)
    Unroller.unrollBlock(VPB);

  unsigned Part = 1;
  // Remap operands of cloned header phis to update backedge values. The header
  // phis cloned during unrolling are just after the header phi for part 0.
  // Reset Part to 1 when reaching the next original (part 0) phi.
  for (VPRecipeBase &H :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    // The second operand of first-order recurrence phis, feeding the spliced
    // value across the backedge, must be remapped to the last part of the
    // spliced value.
    if (isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {
      Unroller.remapOperand(&H, 1, UF - 1);
      continue;
    }
    if (Unroller.contains(H.getVPSingleValue()) ||
        isa<VPWidenPointerInductionRecipe>(&H)) {
      Part = 1;
      continue;
    }
    Unroller.remapOperands(&H, Part);
    Part++;
  }

  // Remap the operand of live-outs to the last part.
  for (const auto &[_, LO] : Plan.getLiveOuts()) {
    VPValue *In = Unroller.getValueForPart(LO->getOperand(0), UF - 1);
    LO->setOperand(0, In);
  }

  VPlanTransforms::removeDeadRecipes(Plan);
}