//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements explicit unrolling for VPlans.
///
//===----------------------------------------------------------------------===//

#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

namespace {

/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
/// the unrolling transformation, where the original VPValues are retained for
/// part zero.
class UnrollState {
  /// Plan to unroll.
  VPlan &Plan;
  /// Unroll factor to unroll by.
  const unsigned UF;
  /// Analysis for types.
  VPTypeAnalysis TypeInfo;

  /// Unrolling may create recipes that should not be unrolled themselves.
  /// Those are tracked in ToSkip.
  SmallPtrSet<VPRecipeBase *, 8> ToSkip;

  // Associate with each VPValue of part 0 its unrolled instances of parts 1,
  // ..., UF-1.
  DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;

  /// Unroll replicate region \p VPR by cloning the region UF - 1 times.
  void unrollReplicateRegionByUF(VPRegionBlock *VPR);

  /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform
  /// across all parts.
  void unrollRecipeByUF(VPRecipeBase &R);

  /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled
  /// depends on the concrete header phi. Inserts newly created recipes at \p
  /// InsertPtForPhi.
  void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                           VPBasicBlock::iterator InsertPtForPhi);

  /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
  /// the induction steps for each part.
  void unrollWidenInductionByUF(VPWidenIntOrFpInductionRecipe *IV,
                                VPBasicBlock::iterator InsertPtForPhi);

  VPValue *getConstantVPV(unsigned Part) {
    Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
    return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
  }

public:
  UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
      : Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {}

  void unrollBlock(VPBlockBase *VPB);
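
  /// Return the VPValue to use for \p V in part \p Part. Part 0 and live-ins
  /// are shared by all parts and map to \p V itself; later parts map to the
  /// copy recorded in VPV2Parts.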
  VPValue *getValueForPart(VPValue *V, unsigned Part) {
    if (Part == 0 || V->isLiveIn())
      return V;
    assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
           "accessed value does not exist");
    return VPV2Parts[V][Part - 1];
  }

  /// Given a single original recipe \p OrigR (of part zero), and its copy \p
  /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
  /// corresponding VPValue defined by \p CopyR.
  void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
                        unsigned Part) {
    for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
      auto Ins = VPV2Parts.insert({VPV, {}});
      assert(Ins.first->second.size() == Part - 1 && "earlier parts not set");
      Ins.first->second.push_back(CopyR->getVPValue(Idx));
    }
  }

  /// Given a uniform recipe \p R, add it for all parts.
  void addUniformForAllParts(VPSingleDefRecipe *R) {
    auto Ins = VPV2Parts.insert({R, {}});
    assert(Ins.second && "uniform value already added");
    for (unsigned Part = 0; Part != UF; ++Part)
      Ins.first->second.push_back(R);
  }

  bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }

  /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
  /// \p Part.
  void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
    auto *Op = R->getOperand(OpIdx);
    R->setOperand(OpIdx, getValueForPart(Op, Part));
  }

  /// Update \p R's operands with their corresponding VPValues for part \p
  /// Part.
  void remapOperands(VPRecipeBase *R, unsigned Part) {
    for (const auto &[OpIdx, Op] : enumerate(R->operands()))
      R->setOperand(OpIdx, getValueForPart(Op, Part));
  }
};
} // namespace

void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
  VPBlockBase *InsertPt = VPR->getSingleSuccessor();
  for (unsigned Part = 1; Part != UF; ++Part) {
    auto *Copy = VPR->clone();
    VPBlockUtils::insertBlockBefore(Copy, InsertPt);

    auto PartI = vp_depth_first_shallow(Copy->getEntry());
    auto Part0 = vp_depth_first_shallow(VPR->getEntry());
    for (const auto &[PartIVPBB, Part0VPBB] :
         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(PartI),
             VPBlockUtils::blocksOnly<VPBasicBlock>(Part0))) {
      for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
        remapOperands(&PartIR, Part);
        if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
          ScalarIVSteps->addOperand(getConstantVPV(Part));
        }

        addRecipeForPart(&Part0R, &PartIR, Part);
      }
    }
  }
}

void UnrollState::unrollWidenInductionByUF(
    VPWidenIntOrFpInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
  VPBasicBlock *PH = cast<VPBasicBlock>(
      IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
  Type *IVTy = TypeInfo.inferScalarType(IV);
  auto &ID = IV->getInductionDescriptor();
  std::optional<FastMathFlags> FMFs;
  if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
    FMFs = ID.getInductionBinOp()->getFastMathFlags();

  VPValue *VectorStep = &Plan.getVF();
  VPBuilder Builder(PH);
  if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
    Instruction::CastOps CastOp =
        IVTy->isFloatingPointTy() ? Instruction::UIToFP : Instruction::Trunc;
    VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
    ToSkip.insert(VectorStep->getDefiningRecipe());
  }
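
  // The increment to advance the induction by one part is VF * ScalarStep
  // (FMul for floating-point inductions). If the scalar step is the constant
  // 1, VF can be used directly and no multiply is needed.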
  VPValue *ScalarStep = IV->getStepValue();
  auto *ConstStep = ScalarStep->isLiveIn()
                        ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
                        : nullptr;
  if (!ConstStep || ConstStep->getValue() != 1) {
    if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
      ScalarStep =
          Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
      ToSkip.insert(ScalarStep->getDefiningRecipe());
    }

    unsigned MulOpc =
        IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
    VPInstruction *Mul = Builder.createNaryOp(MulOpc, {VectorStep, ScalarStep},
                                              FMFs, IV->getDebugLoc());
    VectorStep = Mul;
    ToSkip.insert(Mul);
  }

  // Now create recipes to compute the induction steps for parts 1 .. UF - 1.
  // Part 0 remains the header phi. Parts > 0 are computed by adding VectorStep
  // to the previous part. The header phi recipe will get 2 new operands: the
  // step value for a single part and the last part, used to compute the
  // backedge value during VPWidenIntOrFpInductionRecipe::execute. For example,
  // with UF = 4:
  //   %Part.0 = VPWidenIntOrFpInductionRecipe %Start, %ScalarStep,
  //             %VectorStep, %Part.3
  //   %Part.1 = %Part.0 + %VectorStep
  //   %Part.2 = %Part.1 + %VectorStep
  //   %Part.3 = %Part.2 + %VectorStep
  //
  // The newly added recipes are added to ToSkip to avoid interleaving them
  // again.
  VPValue *Prev = IV;
  Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
  unsigned AddOpc =
      IVTy->isFloatingPointTy() ? ID.getInductionOpcode() : Instruction::Add;
  for (unsigned Part = 1; Part != UF; ++Part) {
    std::string Name =
        Part > 1 ? "step.add." + std::to_string(Part) : "step.add";

    VPInstruction *Add = Builder.createNaryOp(AddOpc,
                                              {
                                                  Prev,
                                                  VectorStep,
                                              },
                                              FMFs, IV->getDebugLoc(), Name);
    ToSkip.insert(Add);
    addRecipeForPart(IV, Add, Part);
    Prev = Add;
  }
  IV->addOperand(VectorStep);
  IV->addOperand(Prev);
}

void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                                      VPBasicBlock::iterator InsertPtForPhi) {
  // First-order recurrences pass a single vector or scalar through their
  // header phis, irrespective of interleaving.
  if (isa<VPFirstOrderRecurrencePHIRecipe>(R))
    return;

  // Generate step vectors for each unrolled part.
  if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(R)) {
    unrollWidenInductionByUF(IV, InsertPtForPhi);
    return;
  }

  auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(R);
  if (RdxPhi && RdxPhi->isOrdered())
    return;

  auto InsertPt = std::next(R->getIterator());
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R->clone();
    Copy->insertBefore(*R->getParent(), InsertPt);
    addRecipeForPart(R, Copy, Part);
    if (isa<VPWidenPointerInductionRecipe>(R)) {
      Copy->addOperand(R);
      Copy->addOperand(getConstantVPV(Part));
    } else if (RdxPhi) {
      Copy->addOperand(getConstantVPV(Part));
    } else {
      assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
             "unexpected header phi recipe not needing unrolled part");
    }
  }
}

/// Handle non-header-phi recipes.
void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
  using namespace llvm::VPlanPatternMatch;
  if (match(&R, m_BranchOnCond(m_VPValue())) ||
      match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
    return;

  if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
    if (vputils::onlyFirstPartUsed(VPI)) {
      addUniformForAllParts(VPI);
      return;
    }
  }
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
    if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
        RepR->getOperand(1)->isDefinedOutsideLoopRegions()) {
      // Stores to an invariant address only need to store the last part.
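      // All parts write the same address, so it suffices to keep the store for
      // the last part, with its operands remapped to that part's values.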
      remapOperands(&R, UF - 1);
      return;
    }
    if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingValue())) {
      if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) {
        addUniformForAllParts(RepR);
        return;
      }
    }
  }

  // Unroll non-uniform recipes.
  auto InsertPt = std::next(R.getIterator());
  VPBasicBlock &VPBB = *R.getParent();
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R.clone();
    Copy->insertBefore(VPBB, InsertPt);
    addRecipeForPart(&R, Copy, Part);

    VPValue *Op;
    if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
                      m_VPValue(), m_VPValue(Op)))) {
      Copy->setOperand(0, getValueForPart(Op, Part - 1));
      Copy->setOperand(1, getValueForPart(Op, Part));
      continue;
    }
    if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
      auto *Phi = cast<VPReductionPHIRecipe>(R.getOperand(0));
      if (Phi->isOrdered()) {
        auto &Parts = VPV2Parts[Phi];
        if (Part == 1) {
          Parts.clear();
          Parts.push_back(Red);
        }
        Parts.push_back(Copy->getVPSingleValue());
        Phi->setOperand(1, Copy->getVPSingleValue());
      }
    }
    remapOperands(Copy, Part);

    // Add an operand indicating the part to generate code for to recipes that
    // still require it.
    if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,
            VPVectorPointerRecipe, VPReverseVectorPointerRecipe>(Copy) ||
        match(Copy, m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
                        m_VPValue())))
      Copy->addOperand(getConstantVPV(Part));

    if (isa<VPVectorPointerRecipe, VPReverseVectorPointerRecipe>(R))
      Copy->setOperand(0, R.getOperand(0));
  }
}

using namespace llvm::VPlanPatternMatch;
void UnrollState::unrollBlock(VPBlockBase *VPB) {
  auto *VPR = dyn_cast<VPRegionBlock>(VPB);
  if (VPR) {
    if (VPR->isReplicator())
      return unrollReplicateRegionByUF(VPR);

    // Traverse blocks in region in RPO to ensure defs are visited before uses
    // across blocks.
    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
        RPOT(VPR->getEntry());
    for (VPBlockBase *VPB : RPOT)
      unrollBlock(VPB);
    return;
  }

  // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
  auto *VPBB = cast<VPBasicBlock>(VPB);
  auto InsertPtForPhi = VPBB->getFirstNonPhi();
  for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
    if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
      continue;

    // Add all VPValues for all parts to ComputeReductionResult, which combines
    // the parts to compute the final reduction value.
    VPValue *Op1;
    if (match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
                      m_VPValue(), m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPInstruction>(&R));
      for (unsigned Part = 1; Part != UF; ++Part)
        R.addOperand(getValueForPart(Op1, Part));
      continue;
    }
    VPValue *Op0;
    if (match(&R, m_VPInstruction<VPInstruction::ExtractFromEnd>(
                      m_VPValue(Op0), m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
      if (Plan.hasScalarVFOnly()) {
        // Extracting from the end with VF = 1 means retrieving the scalar
        // value of part UF - Op1.
        unsigned Offset =
            cast<ConstantInt>(Op1->getLiveInIRValue())->getZExtValue();
        R.getVPSingleValue()->replaceAllUsesWith(
            getValueForPart(Op0, UF - Offset));
        R.eraseFromParent();
      } else {
        // Otherwise we extract from the last part.
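        // The lanes counted from the end live in the last unrolled part, so
        // remapping the operands to part UF - 1 is sufficient.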
        remapOperands(&R, UF - 1);
      }
      continue;
    }

    auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
    if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
      addUniformForAllParts(SingleDef);
      continue;
    }

    if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
      unrollHeaderPHIByUF(H, InsertPtForPhi);
      continue;
    }

    unrollRecipeByUF(R);
  }
}

void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
  assert(UF > 0 && "Unroll factor must be positive");
  Plan.setUF(UF);
  auto Cleanup = make_scope_exit([&Plan]() {
    auto Iter = vp_depth_first_deep(Plan.getEntry());
    // Remove recipes that are redundant after unrolling.
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
      for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
        auto *VPI = dyn_cast<VPInstruction>(&R);
        if (VPI &&
            VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
            VPI->getNumOperands() == 1) {
          VPI->replaceAllUsesWith(VPI->getOperand(0));
          VPI->eraseFromParent();
        }
      }
    }
  });
  if (UF == 1)
    return;

  UnrollState Unroller(Plan, UF, Ctx);

  // Iterate over all blocks in the plan starting from Entry, and unroll
  // recipes inside them. This includes the vector preheader and middle blocks,
  // which may set up or post-process per-part values.
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  for (VPBlockBase *VPB : RPOT)
    Unroller.unrollBlock(VPB);

  unsigned Part = 1;
  // Remap operands of cloned header phis to update backedge values. The header
  // phis cloned during unrolling are placed just after the header phi for part
  // 0. Reset Part to 1 when reaching the next part-0 header phi.
  for (VPRecipeBase &H :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    // The second operand of fixed-order recurrence phis, feeding the spliced
    // value across the backedge, needs to be remapped to the last part of the
    // spliced value.
    if (isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {
      Unroller.remapOperand(&H, 1, UF - 1);
      continue;
    }
    if (Unroller.contains(H.getVPSingleValue()) ||
        isa<VPWidenPointerInductionRecipe>(&H)) {
      Part = 1;
      continue;
    }
    Unroller.remapOperands(&H, Part);
    Part++;
  }

  VPlanTransforms::removeDeadRecipes(Plan);
}