//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// /// \file /// This file implements explicit unrolling for VPlans. /// //===----------------------------------------------------------------------===// #include "VPRecipeBuilder.h" #include "VPlan.h" #include "VPlanAnalysis.h" #include "VPlanCFG.h" #include "VPlanPatternMatch.h" #include "VPlanTransforms.h" #include "VPlanUtils.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/IR/Intrinsics.h" using namespace llvm; namespace { /// Helper to hold state needed for unrolling. It holds the Plan to unroll by /// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate /// the unrolling transformation, where the original VPValues are retained for /// part zero. class UnrollState { /// Plan to unroll. VPlan &Plan; /// Unroll factor to unroll by. const unsigned UF; /// Analysis for types. VPTypeAnalysis TypeInfo; /// Unrolling may create recipes that should not be unrolled themselves. /// Those are tracked in ToSkip. SmallPtrSet ToSkip; // Associate with each VPValue of part 0 its unrolled instances of parts 1, // ..., UF-1. DenseMap> VPV2Parts; /// Unroll replicate region \p VPR by cloning the region UF - 1 times. void unrollReplicateRegionByUF(VPRegionBlock *VPR); /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform across /// all parts. void unrollRecipeByUF(VPRecipeBase &R); /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled /// depends on the concrete header phi. Inserts newly created recipes at \p /// InsertPtForPhi. void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R, VPBasicBlock::iterator InsertPtForPhi); /// Unroll a widen induction recipe \p IV. This introduces recipes to compute /// the induction steps for each part. void unrollWidenInductionByUF(VPWidenIntOrFpInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi); VPValue *getConstantVPV(unsigned Part) { Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType(); return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part)); } public: UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx) : Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {} void unrollBlock(VPBlockBase *VPB); VPValue *getValueForPart(VPValue *V, unsigned Part) { if (Part == 0 || V->isLiveIn()) return V; assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) && "accessed value does not exist"); return VPV2Parts[V][Part - 1]; } /// Given a single original recipe \p OrigR (of part zero), and its copy \p /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its /// corresponding VPValue defined by \p CopyR. void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR, unsigned Part) { for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) { auto Ins = VPV2Parts.insert({VPV, {}}); assert(Ins.first->second.size() == Part - 1 && "earlier parts not set"); Ins.first->second.push_back(CopyR->getVPValue(Idx)); } } /// Given a uniform recipe \p R, add it for all parts. void addUniformForAllParts(VPSingleDefRecipe *R) { auto Ins = VPV2Parts.insert({R, {}}); assert(Ins.second && "uniform value already added"); for (unsigned Part = 0; Part != UF; ++Part) Ins.first->second.push_back(R); } bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); } /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part /// \p P. void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) { auto *Op = R->getOperand(OpIdx); R->setOperand(OpIdx, getValueForPart(Op, Part)); } /// Update \p R's operands with their corresponding VPValues for part \p P. void remapOperands(VPRecipeBase *R, unsigned Part) { for (const auto &[OpIdx, Op] : enumerate(R->operands())) R->setOperand(OpIdx, getValueForPart(Op, Part)); } }; } // namespace void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) { VPBlockBase *InsertPt = VPR->getSingleSuccessor(); for (unsigned Part = 1; Part != UF; ++Part) { auto *Copy = VPR->clone(); VPBlockUtils::insertBlockBefore(Copy, InsertPt); auto PartI = vp_depth_first_shallow(Copy->getEntry()); auto Part0 = vp_depth_first_shallow(VPR->getEntry()); for (const auto &[PartIVPBB, Part0VPBB] : zip(VPBlockUtils::blocksOnly(PartI), VPBlockUtils::blocksOnly(Part0))) { for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) { remapOperands(&PartIR, Part); if (auto *ScalarIVSteps = dyn_cast(&PartIR)) { ScalarIVSteps->addOperand(getConstantVPV(Part)); } addRecipeForPart(&Part0R, &PartIR, Part); } } } } void UnrollState::unrollWidenInductionByUF( VPWidenIntOrFpInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) { VPBasicBlock *PH = cast( IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor()); Type *IVTy = TypeInfo.inferScalarType(IV); auto &ID = IV->getInductionDescriptor(); std::optional FMFs; if (isa_and_present(ID.getInductionBinOp())) FMFs = ID.getInductionBinOp()->getFastMathFlags(); VPValue *VectorStep = &Plan.getVF(); VPBuilder Builder(PH); if (TypeInfo.inferScalarType(VectorStep) != IVTy) { Instruction::CastOps CastOp = IVTy->isFloatingPointTy() ? Instruction::UIToFP : Instruction::Trunc; VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy); ToSkip.insert(VectorStep->getDefiningRecipe()); } VPValue *ScalarStep = IV->getStepValue(); auto *ConstStep = ScalarStep->isLiveIn() ? dyn_cast(ScalarStep->getLiveInIRValue()) : nullptr; if (!ConstStep || ConstStep->getValue() != 1) { if (TypeInfo.inferScalarType(ScalarStep) != IVTy) { ScalarStep = Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy); ToSkip.insert(ScalarStep->getDefiningRecipe()); } unsigned MulOpc = IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul; VPInstruction *Mul = Builder.createNaryOp(MulOpc, {VectorStep, ScalarStep}, FMFs, IV->getDebugLoc()); VectorStep = Mul; ToSkip.insert(Mul); } // Now create recipes to compute the induction steps for part 1 .. UF. Part 0 // remains the header phi. Parts > 0 are computed by adding Step to the // previous part. The header phi recipe will get 2 new operands: the step // value for a single part and the last part, used to compute the backedge // value during VPWidenIntOrFpInductionRecipe::execute. %Part.0 = // VPWidenIntOrFpInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3 // %Part.1 = %Part.0 + %VectorStep // %Part.2 = %Part.1 + %VectorStep // %Part.3 = %Part.2 + %VectorStep // // The newly added recipes are added to ToSkip to avoid interleaving them // again. VPValue *Prev = IV; Builder.setInsertPoint(IV->getParent(), InsertPtForPhi); unsigned AddOpc = IVTy->isFloatingPointTy() ? ID.getInductionOpcode() : Instruction::Add; for (unsigned Part = 1; Part != UF; ++Part) { std::string Name = Part > 1 ? "step.add." + std::to_string(Part) : "step.add"; VPInstruction *Add = Builder.createNaryOp(AddOpc, { Prev, VectorStep, }, FMFs, IV->getDebugLoc(), Name); ToSkip.insert(Add); addRecipeForPart(IV, Add, Part); Prev = Add; } IV->addOperand(VectorStep); IV->addOperand(Prev); } void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R, VPBasicBlock::iterator InsertPtForPhi) { // First-order recurrences pass a single vector or scalar through their header // phis, irrespective of interleaving. if (isa(R)) return; // Generate step vectors for each unrolled part. if (auto *IV = dyn_cast(R)) { unrollWidenInductionByUF(IV, InsertPtForPhi); return; } auto *RdxPhi = dyn_cast(R); if (RdxPhi && RdxPhi->isOrdered()) return; auto InsertPt = std::next(R->getIterator()); for (unsigned Part = 1; Part != UF; ++Part) { VPRecipeBase *Copy = R->clone(); Copy->insertBefore(*R->getParent(), InsertPt); addRecipeForPart(R, Copy, Part); if (isa(R)) { Copy->addOperand(R); Copy->addOperand(getConstantVPV(Part)); } else if (RdxPhi) { Copy->addOperand(getConstantVPV(Part)); } else { assert(isa(R) && "unexpected header phi recipe not needing unrolled part"); } } } /// Handle non-header-phi recipes. void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { using namespace llvm::VPlanPatternMatch; if (match(&R, m_BranchOnCond(m_VPValue())) || match(&R, m_BranchOnCount(m_VPValue(), m_VPValue()))) return; if (auto *VPI = dyn_cast(&R)) { if (vputils::onlyFirstPartUsed(VPI)) { addUniformForAllParts(VPI); return; } } if (auto *RepR = dyn_cast(&R)) { if (isa(RepR->getUnderlyingValue()) && RepR->getOperand(1)->isDefinedOutsideLoopRegions()) { // Stores to an invariant address only need to store the last part. remapOperands(&R, UF - 1); return; } if (auto *II = dyn_cast(RepR->getUnderlyingValue())) { if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) { addUniformForAllParts(RepR); return; } } } // Unroll non-uniform recipes. auto InsertPt = std::next(R.getIterator()); VPBasicBlock &VPBB = *R.getParent(); for (unsigned Part = 1; Part != UF; ++Part) { VPRecipeBase *Copy = R.clone(); Copy->insertBefore(VPBB, InsertPt); addRecipeForPart(&R, Copy, Part); VPValue *Op; if (match(&R, m_VPInstruction( m_VPValue(), m_VPValue(Op)))) { Copy->setOperand(0, getValueForPart(Op, Part - 1)); Copy->setOperand(1, getValueForPart(Op, Part)); continue; } if (auto *Red = dyn_cast(&R)) { auto *Phi = cast(R.getOperand(0)); if (Phi->isOrdered()) { auto &Parts = VPV2Parts[Phi]; if (Part == 1) { Parts.clear(); Parts.push_back(Red); } Parts.push_back(Copy->getVPSingleValue()); Phi->setOperand(1, Copy->getVPSingleValue()); } } remapOperands(Copy, Part); // Add operand indicating the part to generate code for, to recipes still // requiring it. if (isa(Copy) || match(Copy, m_VPInstruction( m_VPValue()))) Copy->addOperand(getConstantVPV(Part)); if (isa(R)) Copy->setOperand(0, R.getOperand(0)); } } using namespace llvm::VPlanPatternMatch; void UnrollState::unrollBlock(VPBlockBase *VPB) { auto *VPR = dyn_cast(VPB); if (VPR) { if (VPR->isReplicator()) return unrollReplicateRegionByUF(VPR); // Traverse blocks in region in RPO to ensure defs are visited before uses // across blocks. ReversePostOrderTraversal> RPOT(VPR->getEntry()); for (VPBlockBase *VPB : RPOT) unrollBlock(VPB); return; } // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes. auto *VPBB = cast(VPB); auto InsertPtForPhi = VPBB->getFirstNonPhi(); for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { if (ToSkip.contains(&R) || isa(&R)) continue; // Add all VPValues for all parts to ComputeReductionResult which combines // the parts to compute the final reduction value. VPValue *Op1; if (match(&R, m_VPInstruction( m_VPValue(), m_VPValue(Op1)))) { addUniformForAllParts(cast(&R)); for (unsigned Part = 1; Part != UF; ++Part) R.addOperand(getValueForPart(Op1, Part)); continue; } VPValue *Op0; if (match(&R, m_VPInstruction( m_VPValue(Op0), m_VPValue(Op1)))) { addUniformForAllParts(cast(&R)); if (Plan.hasScalarVFOnly()) { // Extracting from end with VF = 1 implies retrieving the scalar part UF // - Op1. unsigned Offset = cast(Op1->getLiveInIRValue())->getZExtValue(); R.getVPSingleValue()->replaceAllUsesWith( getValueForPart(Op0, UF - Offset)); R.eraseFromParent(); } else { // Otherwise we extract from the last part. remapOperands(&R, UF - 1); } continue; } auto *SingleDef = dyn_cast(&R); if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) { addUniformForAllParts(SingleDef); continue; } if (auto *H = dyn_cast(&R)) { unrollHeaderPHIByUF(H, InsertPtForPhi); continue; } unrollRecipeByUF(R); } } void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) { assert(UF > 0 && "Unroll factor must be positive"); Plan.setUF(UF); auto Cleanup = make_scope_exit([&Plan]() { auto Iter = vp_depth_first_deep(Plan.getEntry()); // Remove recipes that are redundant after unrolling. for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { auto *VPI = dyn_cast(&R); if (VPI && VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart && VPI->getNumOperands() == 1) { VPI->replaceAllUsesWith(VPI->getOperand(0)); VPI->eraseFromParent(); } } } }); if (UF == 1) { return; } UnrollState Unroller(Plan, UF, Ctx); // Iterate over all blocks in the plan starting from Entry, and unroll // recipes inside them. This includes the vector preheader and middle blocks, // which may set up or post-process per-part values. ReversePostOrderTraversal> RPOT( Plan.getEntry()); for (VPBlockBase *VPB : RPOT) Unroller.unrollBlock(VPB); unsigned Part = 1; // Remap operands of cloned header phis to update backedge values. The header // phis cloned during unrolling are just after the header phi for part 0. // Reset Part to 1 when reaching the first (part 0) recipe of a block. for (VPRecipeBase &H : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { // The second operand of Fixed Order Recurrence phi's, feeding the spliced // value across the backedge, needs to remap to the last part of the spliced // value. if (isa(&H)) { Unroller.remapOperand(&H, 1, UF - 1); continue; } if (Unroller.contains(H.getVPSingleValue()) || isa(&H)) { Part = 1; continue; } Unroller.remapOperands(&H, Part); Part++; } VPlanTransforms::removeDeadRecipes(Plan); }