//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements explicit unrolling for VPlans.
///
//===----------------------------------------------------------------------===//

#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;

namespace {

/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
/// the unrolling transformation, where the original VPValues are retained for
/// part zero.
class UnrollState {
  /// Plan to unroll.
  VPlan &Plan;
  /// Unroll factor to unroll by.
  const unsigned UF;
  /// Analysis for types.
  VPTypeAnalysis TypeInfo;

  /// Unrolling may create recipes that should not be unrolled themselves.
  /// Those are tracked in ToSkip.
  SmallPtrSet<VPRecipeBase *, 8> ToSkip;

  // Associate with each VPValue of part 0 its unrolled instances of parts 1,
  // ..., UF-1.
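  // For example (illustrative, hypothetical names), with UF = 3 a value %d
  // defined by a recipe of part 0 and cloned into %d.1 and %d.2 maps as
  //   VPV2Parts[%d] = {%d.1, %d.2},
  // with part 0 represented by %d itself.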
  DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;

  /// Unroll replicate region \p VPR by cloning the region UF - 1 times.
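  /// For example (a rough sketch, with hypothetical names), for UF = 2 a
  /// replicate region
  ///   REPLICATE-REGION { %rep = load %addr }
  /// is followed by a clone
  ///   REPLICATE-REGION { %rep.1 = load %addr.1 }
  /// whose operands have been remapped to their part-1 values.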
  void unrollReplicateRegionByUF(VPRegionBlock *VPR);

  /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform across
  /// all parts.
  void unrollRecipeByUF(VPRecipeBase &R);

  /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled
  /// depends on the concrete header phi. Inserts newly created recipes at \p
  /// InsertPtForPhi.
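  /// For example (illustrative only), a reduction phi for UF = 2 is cloned
  /// once, with the clone carrying the part index as an extra operand:
  ///   %rdx   = reduction-phi %init, %rdx.next
  ///   %rdx.1 = reduction-phi %init, %rdx.next.1, ir<1>
  /// The clone's backedge operand is remapped to its part-1 value later, in
  /// unrollByUF.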
  void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                           VPBasicBlock::iterator InsertPtForPhi);

  /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
  /// the induction steps for each part.
  void unrollWidenInductionByUF(VPWidenIntOrFpInductionRecipe *IV,
                                VPBasicBlock::iterator InsertPtForPhi);

  VPValue *getConstantVPV(unsigned Part) {
    Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
    return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
  }

public:
  UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
      : Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {}

  void unrollBlock(VPBlockBase *VPB);

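  /// Return the VPValue corresponding to \p V for part \p Part. Part 0 and
  /// live-ins map to \p V itself; all other parts must already have been
  /// recorded in VPV2Parts.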
  VPValue *getValueForPart(VPValue *V, unsigned Part) {
    if (Part == 0 || V->isLiveIn())
      return V;
    assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
           "accessed value does not exist");
    return VPV2Parts[V][Part - 1];
  }

  /// Given a single original recipe \p OrigR (of part zero), and its copy \p
  /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
  /// corresponding VPValue defined by \p CopyR.
  void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
                        unsigned Part) {
    for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
      auto Ins = VPV2Parts.insert({VPV, {}});
      assert(Ins.first->second.size() == Part - 1 && "earlier parts not set");
      Ins.first->second.push_back(CopyR->getVPValue(Idx));
    }
  }

  /// Given a uniform recipe \p R, add it for all parts.
  void addUniformForAllParts(VPSingleDefRecipe *R) {
    auto Ins = VPV2Parts.insert({R, {}});
    assert(Ins.second && "uniform value already added");
    for (unsigned Part = 0; Part != UF; ++Part)
      Ins.first->second.push_back(R);
  }

  bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }

  /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
  /// \p Part.
  void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
    auto *Op = R->getOperand(OpIdx);
    R->setOperand(OpIdx, getValueForPart(Op, Part));
  }

  /// Update \p R's operands with their corresponding VPValues for part \p
  /// Part.
  void remapOperands(VPRecipeBase *R, unsigned Part) {
    for (const auto &[OpIdx, Op] : enumerate(R->operands()))
      R->setOperand(OpIdx, getValueForPart(Op, Part));
  }
};
} // namespace

void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
  VPBlockBase *InsertPt = VPR->getSingleSuccessor();
  for (unsigned Part = 1; Part != UF; ++Part) {
    auto *Copy = VPR->clone();
    VPBlockUtils::insertBlockBefore(Copy, InsertPt);

    auto PartI = vp_depth_first_shallow(Copy->getEntry());
    auto Part0 = vp_depth_first_shallow(VPR->getEntry());
    for (const auto &[PartIVPBB, Part0VPBB] :
         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(PartI),
             VPBlockUtils::blocksOnly<VPBasicBlock>(Part0))) {
      for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
        remapOperands(&PartIR, Part);
        if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
          ScalarIVSteps->addOperand(getConstantVPV(Part));
        }

        addRecipeForPart(&Part0R, &PartIR, Part);
      }
    }
  }
}

void UnrollState::unrollWidenInductionByUF(
    VPWidenIntOrFpInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
  VPBasicBlock *PH = cast<VPBasicBlock>(
      IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
  Type *IVTy = TypeInfo.inferScalarType(IV);
  auto &ID = IV->getInductionDescriptor();
  std::optional<FastMathFlags> FMFs;
  if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
    FMFs = ID.getInductionBinOp()->getFastMathFlags();

  VPValue *VectorStep = &Plan.getVF();
  VPBuilder Builder(PH);
  if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
    Instruction::CastOps CastOp =
        IVTy->isFloatingPointTy() ? Instruction::UIToFP : Instruction::Trunc;
    VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
    ToSkip.insert(VectorStep->getDefiningRecipe());
  }

  VPValue *ScalarStep = IV->getStepValue();
  auto *ConstStep = ScalarStep->isLiveIn()
                        ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
                        : nullptr;
  if (!ConstStep || ConstStep->getZExtValue() != 1) {
    if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
      ScalarStep =
          Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
      ToSkip.insert(ScalarStep->getDefiningRecipe());
    }

    unsigned MulOpc =
        IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
    VPInstruction *Mul = Builder.createNaryOp(MulOpc, {VectorStep, ScalarStep},
                                              FMFs, IV->getDebugLoc());
    VectorStep = Mul;
    ToSkip.insert(Mul);
  }

  // Now create recipes to compute the induction steps for parts 1 .. UF - 1.
  // Part 0 remains the header phi. Parts > 0 are computed by adding VectorStep
  // to the previous part. The header phi recipe gets 2 new operands: the step
  // value for a single part and the last part, used to compute the backedge
  // value during VPWidenIntOrFpInductionRecipe::execute:
  //   %Part.0 = VPWidenIntOrFpInductionRecipe %Start, %ScalarStep,
  //             %VectorStep, %Part.3
  //   %Part.1 = %Part.0 + %VectorStep
  //   %Part.2 = %Part.1 + %VectorStep
  //   %Part.3 = %Part.2 + %VectorStep
  //
  // The newly added recipes are also added to ToSkip, to avoid unrolling them
  // again.
  VPValue *Prev = IV;
  Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
  unsigned AddOpc =
      IVTy->isFloatingPointTy() ? ID.getInductionOpcode() : Instruction::Add;
  for (unsigned Part = 1; Part != UF; ++Part) {
    std::string Name =
        Part > 1 ? "step.add." + std::to_string(Part) : "step.add";

    VPInstruction *Add = Builder.createNaryOp(AddOpc, {Prev, VectorStep}, FMFs,
                                              IV->getDebugLoc(), Name);
    ToSkip.insert(Add);
    addRecipeForPart(IV, Add, Part);
    Prev = Add;
  }
  IV->addOperand(VectorStep);
  IV->addOperand(Prev);
}

void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                                      VPBasicBlock::iterator InsertPtForPhi) {
  // First-order recurrences pass a single vector or scalar through their
  // header phis, irrespective of interleaving.
  if (isa<VPFirstOrderRecurrencePHIRecipe>(R))
    return;

  // Generate step vectors for each unrolled part.
  if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(R)) {
    unrollWidenInductionByUF(IV, InsertPtForPhi);
    return;
  }

  auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(R);
  if (RdxPhi && RdxPhi->isOrdered())
    return;

  auto InsertPt = std::next(R->getIterator());
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R->clone();
    Copy->insertBefore(*R->getParent(), InsertPt);
    addRecipeForPart(R, Copy, Part);
    if (isa<VPWidenPointerInductionRecipe>(R)) {
      Copy->addOperand(R);
      Copy->addOperand(getConstantVPV(Part));
    } else if (RdxPhi) {
      Copy->addOperand(getConstantVPV(Part));
    } else {
      assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
             "unexpected header phi recipe not needing unrolled part");
    }
  }
}

/// Handle non-header-phi recipes.
void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
  using namespace llvm::VPlanPatternMatch;
  if (match(&R, m_BranchOnCond(m_VPValue())) ||
      match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
    return;

  if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
    if (vputils::onlyFirstPartUsed(VPI)) {
      addUniformForAllParts(VPI);
      return;
    }
  }
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
    if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
        RepR->getOperand(1)->isDefinedOutsideLoopRegions()) {
      // Stores to an invariant address only need to store the last part.
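      // The single store recipe is retained as-is; remapping its operands to
      // part UF - 1 makes it store the final part's value. E.g., for UF = 4,
      // only the value of part 3 reaches memory.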
      remapOperands(&R, UF - 1);
      return;
    }
    if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingValue())) {
      if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) {
        addUniformForAllParts(RepR);
        return;
      }
    }
  }

  // Unroll non-uniform recipes.
  auto InsertPt = std::next(R.getIterator());
  VPBasicBlock &VPBB = *R.getParent();
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R.clone();
    Copy->insertBefore(VPBB, InsertPt);
    addRecipeForPart(&R, Copy, Part);

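    // A first-order recurrence splice combines the tail of one part with the
    // head of the next, so the copy for part P takes parts P-1 and P of the
    // recurrence value. For example (illustrative), with UF = 2:
    //   %s   = splice(%for.phi, %x)   ; part 0
    //   %s.1 = splice(%x, %x.1)       ; part 1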
    VPValue *Op;
    if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
                      m_VPValue(), m_VPValue(Op)))) {
      Copy->setOperand(0, getValueForPart(Op, Part - 1));
      Copy->setOperand(1, getValueForPart(Op, Part));
      continue;
    }
    if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
      auto *Phi = cast<VPReductionPHIRecipe>(R.getOperand(0));
      if (Phi->isOrdered()) {
        auto Ins = VPV2Parts.insert({Phi, {}});
        if (Part == 1) {
          Ins.first->second.clear();
          Ins.first->second.push_back(Red);
        }
        Ins.first->second.push_back(Copy->getVPSingleValue());
        Phi->setOperand(1, Copy->getVPSingleValue());
      }
    }
    remapOperands(Copy, Part);

    // Add an operand indicating the part to generate code for, to recipes that
    // still require it.
    if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,
            VPVectorPointerRecipe, VPReverseVectorPointerRecipe>(Copy) ||
        match(Copy, m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
                        m_VPValue())))
      Copy->addOperand(getConstantVPV(Part));

    if (isa<VPVectorPointerRecipe, VPReverseVectorPointerRecipe>(R))
      Copy->setOperand(0, R.getOperand(0));
  }
}

using namespace llvm::VPlanPatternMatch;
void UnrollState::unrollBlock(VPBlockBase *VPB) {
  auto *VPR = dyn_cast<VPRegionBlock>(VPB);
  if (VPR) {
    if (VPR->isReplicator())
      return unrollReplicateRegionByUF(VPR);

    // Traverse blocks in region in RPO to ensure defs are visited before uses
    // across blocks.
    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
        RPOT(VPR->getEntry());
    for (VPBlockBase *VPB : RPOT)
      unrollBlock(VPB);
    return;
  }

  // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
  auto *VPBB = cast<VPBasicBlock>(VPB);
  auto InsertPtForPhi = VPBB->getFirstNonPhi();
  for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
    if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
      continue;

    // Add all VPValues for all parts to ComputeReductionResult which combines
    // the parts to compute the final reduction value.
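    // For example (illustrative), with UF = 2:
    //   compute-reduction-result %rdx.phi, %red
    // becomes
    //   compute-reduction-result %rdx.phi, %red, %red.1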
    VPValue *Op1;
    if (match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
                      m_VPValue(), m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPInstruction>(&R));
      for (unsigned Part = 1; Part != UF; ++Part)
        R.addOperand(getValueForPart(Op1, Part));
      continue;
    }
    VPValue *Op0;
    if (match(&R, m_VPInstruction<VPInstruction::ExtractFromEnd>(
                      m_VPValue(Op0), m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
      if (Plan.hasScalarVFOnly()) {
        // Extracting from end with VF = 1 implies retrieving the scalar part
        // UF - Op1.
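        // E.g., with UF = 4 and Op1 == 1, the extract yields part 3 of Op0,
        // i.e. the last unrolled part.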
        unsigned Offset =
            cast<ConstantInt>(Op1->getLiveInIRValue())->getZExtValue();
        R.getVPSingleValue()->replaceAllUsesWith(
            getValueForPart(Op0, UF - Offset));
        R.eraseFromParent();
      } else {
        // Otherwise we extract from the last part.
        remapOperands(&R, UF - 1);
      }
      continue;
    }

    auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
    if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
      addUniformForAllParts(SingleDef);
      continue;
    }

    if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
      unrollHeaderPHIByUF(H, InsertPtForPhi);
      continue;
    }

    unrollRecipeByUF(R);
  }
}

void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
  assert(UF > 0 && "Unroll factor must be positive");
  Plan.setUF(UF);
  auto Cleanup = make_scope_exit([&Plan]() {
    auto Iter = vp_depth_first_deep(Plan.getEntry());
    // Remove recipes that are redundant after unrolling.
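    // In particular, a CanonicalIVIncrementForPart without a part operand (the
    // part-0 instance, or any instance when UF == 1) is an increment by zero
    // and can be replaced by its single operand.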
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
      for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
        auto *VPI = dyn_cast<VPInstruction>(&R);
        if (VPI &&
            VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
            VPI->getNumOperands() == 1) {
          VPI->replaceAllUsesWith(VPI->getOperand(0));
          VPI->eraseFromParent();
        }
      }
    }
  });
  if (UF == 1)
    return;

  UnrollState Unroller(Plan, UF, Ctx);

  Unroller.unrollBlock(Plan.getPreheader());

  // Iterate over all blocks in the plan starting from Entry, and unroll
  // recipes inside them. This includes the vector preheader and middle blocks,
  // which may set up or post-process per-part values.
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  for (VPBlockBase *VPB : RPOT)
    Unroller.unrollBlock(VPB);

  unsigned Part = 1;
  // Remap operands of cloned header phis to update backedge values. The header
  // phis cloned during unrolling are just after the header phi for part 0.
  // Reset Part to 1 when reaching the first (part 0) recipe of a block.
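  // For example (illustrative), with UF = 2 a reduction phi %rdx is followed
  // by its part-1 clone, whose backedge operand is remapped from %red to
  // %red.1.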
  for (VPRecipeBase &H :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    // The second operand of fixed-order recurrence phis, feeding the spliced
    // value across the backedge, needs to be remapped to the last part of the
    // spliced value.
    if (isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {
      Unroller.remapOperand(&H, 1, UF - 1);
      continue;
    }
    if (Unroller.contains(H.getVPSingleValue()) ||
        isa<VPWidenPointerInductionRecipe>(&H)) {
      Part = 1;
      continue;
    }
    Unroller.remapOperands(&H, Part);
    Part++;
  }

  // Remap the operand of live-outs to the last part.
  for (const auto &[_, LO] : Plan.getLiveOuts()) {
    VPValue *In = Unroller.getValueForPart(LO->getOperand(0), UF - 1);
    LO->setOperand(0, In);
  }

  VPlanTransforms::removeDeadRecipes(Plan);
}
460