//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements explicit unrolling for VPlans.
///
//===----------------------------------------------------------------------===//

#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;

namespace {

/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
/// the unrolling transformation, where the original VPValues are retained for
/// part zero.
class UnrollState {
  /// Plan to unroll.
  VPlan &Plan;
  /// Unroll factor to unroll by.
  const unsigned UF;
  /// Analysis for types.
  VPTypeAnalysis TypeInfo;

  /// Unrolling may create recipes that should not be unrolled themselves.
  /// Those are tracked in ToSkip.
  SmallPtrSet<VPRecipeBase *, 8> ToSkip;

  // Associate with each VPValue of part 0 its unrolled instances of parts 1,
  // ..., UF-1.
  DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;

  /// Unroll replicate region \p VPR by cloning the region UF - 1 times.
  void unrollReplicateRegionByUF(VPRegionBlock *VPR);

  /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform
  /// across all parts.
  void unrollRecipeByUF(VPRecipeBase &R);

  /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled
  /// depends on the concrete header phi. Inserts newly created recipes at
  /// \p InsertPtForPhi.
  void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                           VPBasicBlock::iterator InsertPtForPhi);

  /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
  /// the induction steps for each part.
  void unrollWidenInductionByUF(VPWidenIntOrFpInductionRecipe *IV,
                                VPBasicBlock::iterator InsertPtForPhi);

  VPValue *getConstantVPV(unsigned Part) {
    Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
    return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
  }

public:
  UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
      : Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {}

  void unrollBlock(VPBlockBase *VPB);

  VPValue *getValueForPart(VPValue *V, unsigned Part) {
    if (Part == 0 || V->isLiveIn())
      return V;
    assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
           "accessed value does not exist");
    return VPV2Parts[V][Part - 1];
  }

  /// Given a single original recipe \p OrigR (of part zero), and its copy
  /// \p CopyR for part \p Part, map every VPValue defined by \p OrigR to its
  /// corresponding VPValue defined by \p CopyR.
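  /// Parts must be recorded in ascending order: the copy for part \p Part can
  /// only be added once parts 1 .. \p Part - 1 have been recorded for
  /// \p OrigR.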
  void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
                        unsigned Part) {
    for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
      auto Ins = VPV2Parts.insert({VPV, {}});
      assert(Ins.first->second.size() == Part - 1 && "earlier parts not set");
      Ins.first->second.push_back(CopyR->getVPValue(Idx));
    }
  }

  /// Given a uniform recipe \p R, add it for all parts.
  void addUniformForAllParts(VPSingleDefRecipe *R) {
    auto Ins = VPV2Parts.insert({R, {}});
    assert(Ins.second && "uniform value already added");
    for (unsigned Part = 0; Part != UF; ++Part)
      Ins.first->second.push_back(R);
  }

  bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }

  /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
  /// \p Part.
  void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
    auto *Op = R->getOperand(OpIdx);
    R->setOperand(OpIdx, getValueForPart(Op, Part));
  }

  /// Update \p R's operands with their corresponding VPValues for part
  /// \p Part.
  void remapOperands(VPRecipeBase *R, unsigned Part) {
    for (const auto &[OpIdx, Op] : enumerate(R->operands()))
      R->setOperand(OpIdx, getValueForPart(Op, Part));
  }
};
} // namespace

void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
  VPBlockBase *InsertPt = VPR->getSingleSuccessor();
  for (unsigned Part = 1; Part != UF; ++Part) {
    auto *Copy = VPR->clone();
    VPBlockUtils::insertBlockBefore(Copy, InsertPt);

    auto PartI = vp_depth_first_shallow(Copy->getEntry());
    auto Part0 = vp_depth_first_shallow(VPR->getEntry());
    for (const auto &[PartIVPBB, Part0VPBB] :
         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(PartI),
             VPBlockUtils::blocksOnly<VPBasicBlock>(Part0))) {
      for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
        remapOperands(&PartIR, Part);
        if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
          ScalarIVSteps->addOperand(getConstantVPV(Part));
        }

        addRecipeForPart(&Part0R, &PartIR, Part);
      }
    }
  }
}

void UnrollState::unrollWidenInductionByUF(
    VPWidenIntOrFpInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
  VPBasicBlock *PH = cast<VPBasicBlock>(
      IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
  Type *IVTy = TypeInfo.inferScalarType(IV);
  auto &ID = IV->getInductionDescriptor();
  std::optional<FastMathFlags> FMFs;
  if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
    FMFs = ID.getInductionBinOp()->getFastMathFlags();

  VPValue *VectorStep = &Plan.getVF();
  VPBuilder Builder(PH);
  if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
    Instruction::CastOps CastOp =
        IVTy->isFloatingPointTy() ? Instruction::UIToFP : Instruction::Trunc;
    VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
    ToSkip.insert(VectorStep->getDefiningRecipe());
  }

  VPValue *ScalarStep = IV->getStepValue();
  auto *ConstStep = ScalarStep->isLiveIn()
                        ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
                        : nullptr;
  if (!ConstStep || ConstStep->getZExtValue() != 1) {
    if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
      ScalarStep =
          Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
      ToSkip.insert(ScalarStep->getDefiningRecipe());
    }

    unsigned MulOpc =
        IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
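    // Scale the vector step (VF) by the scalar step, so VectorStep holds
    // VF * Step, the increment between consecutive parts.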
    VPInstruction *Mul = Builder.createNaryOp(MulOpc, {VectorStep, ScalarStep},
                                              FMFs, IV->getDebugLoc());
    VectorStep = Mul;
    ToSkip.insert(Mul);
  }

  // Now create recipes to compute the induction steps for parts 1 .. UF - 1.
  // Part 0 remains the header phi. Parts > 0 are computed by adding Step to
  // the previous part. The header phi recipe will get 2 new operands: the step
  // value for a single part and the last part, used to compute the backedge
  // value during VPWidenIntOrFpInductionRecipe::execute.
  //   %Part.0 = VPWidenIntOrFpInductionRecipe %Start, %ScalarStep,
  //             %VectorStep, %Part.3
  //   %Part.1 = %Part.0 + %VectorStep
  //   %Part.2 = %Part.1 + %VectorStep
  //   %Part.3 = %Part.2 + %VectorStep
  //
  // The newly added recipes are added to ToSkip to avoid interleaving them
  // again.
  VPValue *Prev = IV;
  Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
  unsigned AddOpc =
      IVTy->isFloatingPointTy() ? ID.getInductionOpcode() : Instruction::Add;
  for (unsigned Part = 1; Part != UF; ++Part) {
    std::string Name =
        Part > 1 ? "step.add." + std::to_string(Part) : "step.add";

    VPInstruction *Add = Builder.createNaryOp(AddOpc,
                                              {
                                                  Prev,
                                                  VectorStep,
                                              },
                                              FMFs, IV->getDebugLoc(), Name);
    ToSkip.insert(Add);
    addRecipeForPart(IV, Add, Part);
    Prev = Add;
  }
  IV->addOperand(VectorStep);
  IV->addOperand(Prev);
}

void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                                      VPBasicBlock::iterator InsertPtForPhi) {
  // First-order recurrences pass a single vector or scalar through their
  // header phis, irrespective of interleaving.
  if (isa<VPFirstOrderRecurrencePHIRecipe>(R))
    return;

  // Generate step vectors for each unrolled part.
  if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(R)) {
    unrollWidenInductionByUF(IV, InsertPtForPhi);
    return;
  }

  auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(R);
  if (RdxPhi && RdxPhi->isOrdered())
    return;

  auto InsertPt = std::next(R->getIterator());
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R->clone();
    Copy->insertBefore(*R->getParent(), InsertPt);
    addRecipeForPart(R, Copy, Part);
    if (isa<VPWidenPointerInductionRecipe>(R)) {
      Copy->addOperand(R);
      Copy->addOperand(getConstantVPV(Part));
    } else if (RdxPhi) {
      Copy->addOperand(getConstantVPV(Part));
    } else {
      assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
             "unexpected header phi recipe not needing unrolled part");
    }
  }
}

/// Handle non-header-phi recipes.
void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
  using namespace llvm::VPlanPatternMatch;
  if (match(&R, m_BranchOnCond(m_VPValue())) ||
      match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
    return;

  if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
    if (vputils::onlyFirstPartUsed(VPI)) {
      addUniformForAllParts(VPI);
      return;
    }
  }
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
    if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
        RepR->getOperand(1)->isDefinedOutsideLoopRegions()) {
      // Stores to an invariant address only need to store the last part.
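      // Remap the operands to the values of the last part and skip cloning
      // the store for the remaining parts.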
      remapOperands(&R, UF - 1);
      return;
    }
    if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingValue())) {
      if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) {
        addUniformForAllParts(RepR);
        return;
      }
    }
  }

  // Unroll non-uniform recipes.
  auto InsertPt = std::next(R.getIterator());
  VPBasicBlock &VPBB = *R.getParent();
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R.clone();
    Copy->insertBefore(VPBB, InsertPt);
    addRecipeForPart(&R, Copy, Part);

    VPValue *Op;
    if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
                      m_VPValue(), m_VPValue(Op)))) {
      Copy->setOperand(0, getValueForPart(Op, Part - 1));
      Copy->setOperand(1, getValueForPart(Op, Part));
      continue;
    }
    if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
      auto *Phi = cast<VPReductionPHIRecipe>(R.getOperand(0));
      if (Phi->isOrdered()) {
        auto Ins = VPV2Parts.insert({Phi, {}});
        if (Part == 1) {
          Ins.first->second.clear();
          Ins.first->second.push_back(Red);
        }
        Ins.first->second.push_back(Copy->getVPSingleValue());
        Phi->setOperand(1, Copy->getVPSingleValue());
      }
    }
    remapOperands(Copy, Part);

    // Add operand indicating the part to generate code for, to recipes still
    // requiring it.
    if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,
            VPVectorPointerRecipe, VPReverseVectorPointerRecipe>(Copy) ||
        match(Copy,
              m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
                  m_VPValue())))
      Copy->addOperand(getConstantVPV(Part));

    // Vector pointer recipes compute the per-part address from the part-0
    // pointer and the part operand added above, so keep operand 0 at the
    // part-0 value.
    if (isa<VPVectorPointerRecipe, VPReverseVectorPointerRecipe>(R))
      Copy->setOperand(0, R.getOperand(0));
  }
}

using namespace llvm::VPlanPatternMatch;
void UnrollState::unrollBlock(VPBlockBase *VPB) {
  auto *VPR = dyn_cast<VPRegionBlock>(VPB);
  if (VPR) {
    if (VPR->isReplicator())
      return unrollReplicateRegionByUF(VPR);

    // Traverse blocks in region in RPO to ensure defs are visited before uses
    // across blocks.
    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
        RPOT(VPR->getEntry());
    for (VPBlockBase *VPB : RPOT)
      unrollBlock(VPB);
    return;
  }

  // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
  auto *VPBB = cast<VPBasicBlock>(VPB);
  auto InsertPtForPhi = VPBB->getFirstNonPhi();
  for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
    if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
      continue;

    // Add all VPValues for all parts to ComputeReductionResult which combines
    // the parts to compute the final reduction value.
    VPValue *Op1;
    if (match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
                      m_VPValue(), m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPInstruction>(&R));
      for (unsigned Part = 1; Part != UF; ++Part)
        R.addOperand(getValueForPart(Op1, Part));
      continue;
    }
    VPValue *Op0;
    if (match(&R, m_VPInstruction<VPInstruction::ExtractFromEnd>(
                      m_VPValue(Op0), m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
      if (Plan.hasScalarVFOnly()) {
        // Extracting from the end with VF = 1 implies retrieving the scalar
        // part UF - Op1.
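        // Op1 is a constant offset from the end; with a scalar VF each part
        // produces a single scalar, so forward all uses to the value of part
        // UF - Offset and drop the extract.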
        unsigned Offset =
            cast<ConstantInt>(Op1->getLiveInIRValue())->getZExtValue();
        R.getVPSingleValue()->replaceAllUsesWith(
            getValueForPart(Op0, UF - Offset));
        R.eraseFromParent();
      } else {
        // Otherwise we extract from the last part.
        remapOperands(&R, UF - 1);
      }
      continue;
    }

    auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
    if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
      addUniformForAllParts(SingleDef);
      continue;
    }

    if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
      unrollHeaderPHIByUF(H, InsertPtForPhi);
      continue;
    }

    unrollRecipeByUF(R);
  }
}

void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
  assert(UF > 0 && "Unroll factor must be positive");
  Plan.setUF(UF);
  auto Cleanup = make_scope_exit([&Plan]() {
    auto Iter = vp_depth_first_deep(Plan.getEntry());
    // Remove recipes that are redundant after unrolling.
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
      for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
        auto *VPI = dyn_cast<VPInstruction>(&R);
        if (VPI &&
            VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
            VPI->getNumOperands() == 1) {
          VPI->replaceAllUsesWith(VPI->getOperand(0));
          VPI->eraseFromParent();
        }
      }
    }
  });
  if (UF == 1) {
    return;
  }

  UnrollState Unroller(Plan, UF, Ctx);

  Unroller.unrollBlock(Plan.getPreheader());

  // Iterate over all blocks in the plan starting from Entry, and unroll
  // recipes inside them. This includes the vector preheader and middle blocks,
  // which may set up or post-process per-part values.
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  for (VPBlockBase *VPB : RPOT)
    Unroller.unrollBlock(VPB);

  unsigned Part = 1;
  // Remap operands of cloned header phis to update backedge values. The header
  // phis cloned during unrolling are just after the header phi for part 0.
  // Reset Part to 1 when reaching the first (part 0) recipe of a block.
  for (VPRecipeBase &H :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    // The second operand of Fixed Order Recurrence phis, feeding the spliced
    // value across the backedge, needs to remap to the last part of the
    // spliced value.
    if (isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {
      Unroller.remapOperand(&H, 1, UF - 1);
      continue;
    }
    if (Unroller.contains(H.getVPSingleValue()) ||
        isa<VPWidenPointerInductionRecipe>(&H)) {
      Part = 1;
      continue;
    }
    Unroller.remapOperands(&H, Part);
    Part++;
  }

  // Remap the operand of live-outs to the last part.
  for (const auto &[_, LO] : Plan.getLiveOuts()) {
    VPValue *In = Unroller.getValueForPart(LO->getOperand(0), UF - 1);
    LO->setOperand(0, In);
  }

  VPlanTransforms::removeDeadRecipes(Plan);
}