//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements explicit unrolling for VPlans.
///
//===----------------------------------------------------------------------===//

#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;

namespace {

/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
/// the unrolling transformation, where the original VPValues are retained for
/// part zero.
class UnrollState {
  /// Plan to unroll.
  VPlan &Plan;
  /// Unroll factor to unroll by.
  const unsigned UF;
  /// Analysis for types.
  VPTypeAnalysis TypeInfo;

  /// Unrolling may create recipes that should not be unrolled themselves.
  /// Those are tracked in ToSkip.
  SmallPtrSet<VPRecipeBase *, 8> ToSkip;

  // Associate with each VPValue of part 0 its unrolled instances of parts 1,
  // ..., UF-1.
  DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;

  /// Unroll replicate region \p VPR by cloning the region UF - 1 times.
  void unrollReplicateRegionByUF(VPRegionBlock *VPR);

  /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform
  /// across all parts.
  void unrollRecipeByUF(VPRecipeBase &R);

  /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled
  /// depends on the concrete header phi. Inserts newly created recipes at
  /// \p InsertPtForPhi.
  void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                           VPBasicBlock::iterator InsertPtForPhi);

  /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
  /// the induction steps for each part.
  void unrollWidenInductionByUF(VPWidenIntOrFpInductionRecipe *IV,
                                VPBasicBlock::iterator InsertPtForPhi);

  VPValue *getConstantVPV(unsigned Part) {
    Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
    return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
  }

public:
  UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
      : Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {}

  void unrollBlock(VPBlockBase *VPB);

  VPValue *getValueForPart(VPValue *V, unsigned Part) {
    if (Part == 0 || V->isLiveIn())
      return V;
    assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
           "accessed value does not exist");
    return VPV2Parts[V][Part - 1];
  }
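  // Illustrative sketch (hypothetical value names, not taken from the code
  // above): with UF = 4 and a part-0 recipe defining %v, VPV2Parts[%v] holds
  // the clones {%v.1, %v.2, %v.3}, so getValueForPart(%v, 2) returns %v.2,
  // while part 0 and live-ins are returned unchanged.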

  /// Given a single original recipe \p OrigR (of part zero), and its copy
  /// \p CopyR for part \p Part, map every VPValue defined by \p OrigR to its
  /// corresponding VPValue defined by \p CopyR.
  void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
                        unsigned Part) {
    for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
      auto Ins = VPV2Parts.insert({VPV, {}});
      assert(Ins.first->second.size() == Part - 1 && "earlier parts not set");
      Ins.first->second.push_back(CopyR->getVPValue(Idx));
    }
  }

  /// Given a uniform recipe \p R, add it for all parts.
  void addUniformForAllParts(VPSingleDefRecipe *R) {
    auto Ins = VPV2Parts.insert({R, {}});
    assert(Ins.second && "uniform value already added");
    for (unsigned Part = 0; Part != UF; ++Part)
      Ins.first->second.push_back(R);
  }

  bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }

  /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
  /// \p Part.
  void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
    auto *Op = R->getOperand(OpIdx);
    R->setOperand(OpIdx, getValueForPart(Op, Part));
  }

  /// Update \p R's operands with their corresponding VPValues for part
  /// \p Part.
  void remapOperands(VPRecipeBase *R, unsigned Part) {
    for (const auto &[OpIdx, Op] : enumerate(R->operands()))
      R->setOperand(OpIdx, getValueForPart(Op, Part));
  }
};
} // namespace

void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
  VPBlockBase *InsertPt = VPR->getSingleSuccessor();
  for (unsigned Part = 1; Part != UF; ++Part) {
    auto *Copy = VPR->clone();
    VPBlockUtils::insertBlockBefore(Copy, InsertPt);

    auto PartI = vp_depth_first_shallow(Copy->getEntry());
    auto Part0 = vp_depth_first_shallow(VPR->getEntry());
    for (const auto &[PartIVPBB, Part0VPBB] :
         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(PartI),
             VPBlockUtils::blocksOnly<VPBasicBlock>(Part0))) {
      for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
        remapOperands(&PartIR, Part);
        if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
          ScalarIVSteps->addOperand(getConstantVPV(Part));
        }

        addRecipeForPart(&Part0R, &PartIR, Part);
      }
    }
  }
}

void UnrollState::unrollWidenInductionByUF(
    VPWidenIntOrFpInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
  VPBasicBlock *PH = cast<VPBasicBlock>(
      IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
  Type *IVTy = TypeInfo.inferScalarType(IV);
  auto &ID = IV->getInductionDescriptor();
  std::optional<FastMathFlags> FMFs;
  if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
    FMFs = ID.getInductionBinOp()->getFastMathFlags();

  VPValue *VectorStep = &Plan.getVF();
  VPBuilder Builder(PH);
  if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
    Instruction::CastOps CastOp =
        IVTy->isFloatingPointTy() ? Instruction::UIToFP : Instruction::Trunc;
    VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
    ToSkip.insert(VectorStep->getDefiningRecipe());
  }

  VPValue *ScalarStep = IV->getStepValue();
  auto *ConstStep = ScalarStep->isLiveIn()
                        ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
                        : nullptr;
  if (!ConstStep || ConstStep->getZExtValue() != 1) {
    if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
      ScalarStep =
          Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
      ToSkip.insert(ScalarStep->getDefiningRecipe());
    }

    unsigned MulOpc =
        IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
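    // The per-part vector step is the runtime VF scaled by the scalar
    // induction step (FMul for floating-point inductions); it is computed
    // once in the preheader and skipped during unrolling.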
    VPInstruction *Mul = Builder.createNaryOp(MulOpc, {VectorStep, ScalarStep},
                                              FMFs, IV->getDebugLoc());
    VectorStep = Mul;
    ToSkip.insert(Mul);
  }

  // Now create recipes to compute the induction steps for parts 1 .. UF - 1.
  // Part 0 remains the header phi. Parts > 0 are computed by adding VectorStep
  // to the previous part. The header phi recipe will get 2 new operands: the
  // step value for a single part and the last part, used to compute the
  // backedge value during VPWidenIntOrFpInductionRecipe::execute. E.g., for
  // UF = 4:
  //   %Part.0 = VPWidenIntOrFpInductionRecipe %Start, %ScalarStep,
  //             %VectorStep, %Part.3
  //   %Part.1 = %Part.0 + %VectorStep
  //   %Part.2 = %Part.1 + %VectorStep
  //   %Part.3 = %Part.2 + %VectorStep
  //
  // The newly added recipes are added to ToSkip to avoid interleaving them
  // again.
  VPValue *Prev = IV;
  Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
  unsigned AddOpc =
      IVTy->isFloatingPointTy() ? ID.getInductionOpcode() : Instruction::Add;
  for (unsigned Part = 1; Part != UF; ++Part) {
    std::string Name =
        Part > 1 ? "step.add." + std::to_string(Part) : "step.add";

    VPInstruction *Add = Builder.createNaryOp(AddOpc,
                                              {
                                                  Prev,
                                                  VectorStep,
                                              },
                                              FMFs, IV->getDebugLoc(), Name);
    ToSkip.insert(Add);
    addRecipeForPart(IV, Add, Part);
    Prev = Add;
  }
  IV->addOperand(VectorStep);
  IV->addOperand(Prev);
}

void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                                      VPBasicBlock::iterator InsertPtForPhi) {
  // First-order recurrences pass a single vector or scalar through their
  // header phis, irrespective of interleaving.
  if (isa<VPFirstOrderRecurrencePHIRecipe>(R))
    return;

  // Generate step vectors for each unrolled part.
  if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(R)) {
    unrollWidenInductionByUF(IV, InsertPtForPhi);
    return;
  }

  auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(R);
  if (RdxPhi && RdxPhi->isOrdered())
    return;

  auto InsertPt = std::next(R->getIterator());
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R->clone();
    Copy->insertBefore(*R->getParent(), InsertPt);
    addRecipeForPart(R, Copy, Part);
    if (isa<VPWidenPointerInductionRecipe>(R)) {
      Copy->addOperand(R);
      Copy->addOperand(getConstantVPV(Part));
    } else if (RdxPhi) {
      Copy->addOperand(getConstantVPV(Part));
    } else {
      assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
             "unexpected header phi recipe not needing unrolled part");
    }
  }
}

/// Handle non-header-phi recipes.
void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
  using namespace llvm::VPlanPatternMatch;
  if (match(&R, m_BranchOnCond(m_VPValue())) ||
      match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
    return;

  if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
    VPValue *Op0, *Op1;
    if (match(VPI, m_VPInstruction<VPInstruction::ExtractFromEnd>(
                       m_VPValue(Op0), m_VPValue(Op1)))) {
      VPI->setOperand(1, getValueForPart(Op1, UF - 1));
      addUniformForAllParts(VPI);
      if (Plan.hasScalarVFOnly()) {
        // Extracting from end with VF = 1 implies retrieving the scalar part
        // UF - Op1.
        unsigned Offset =
            cast<ConstantInt>(Op1->getLiveInIRValue())->getZExtValue();
        VPI->replaceAllUsesWith(getValueForPart(Op0, UF - Offset));
      } else {
        // Otherwise we extract from the last part.
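        // E.g., with UF = 4 the operands are remapped to their part-3
        // (last-part) values.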
        remapOperands(VPI, UF - 1);
      }
      return;
    }

    if (vputils::onlyFirstPartUsed(VPI)) {
      addUniformForAllParts(VPI);
      return;
    }
  }
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
    if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
        RepR->getOperand(1)->isDefinedOutsideLoopRegions()) {
      // Stores to an invariant address only need to store the last part.
      remapOperands(&R, UF - 1);
      return;
    }
    if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingValue())) {
      if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) {
        addUniformForAllParts(RepR);
        return;
      }
    }
  }

  // Unroll non-uniform recipes.
  auto InsertPt = std::next(R.getIterator());
  VPBasicBlock &VPBB = *R.getParent();
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R.clone();
    Copy->insertBefore(VPBB, InsertPt);
    addRecipeForPart(&R, Copy, Part);

    VPValue *Op;
    if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
                      m_VPValue(), m_VPValue(Op)))) {
      Copy->setOperand(0, getValueForPart(Op, Part - 1));
      Copy->setOperand(1, getValueForPart(Op, Part));
      continue;
    }
    if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
      auto *Phi = cast<VPReductionPHIRecipe>(R.getOperand(0));
      if (Phi->isOrdered()) {
        auto Ins = VPV2Parts.insert({Phi, {}});
        if (Part == 1) {
          Ins.first->second.clear();
          Ins.first->second.push_back(Red);
        }
        Ins.first->second.push_back(Copy->getVPSingleValue());
        Phi->setOperand(1, Copy->getVPSingleValue());
      }
    }
    remapOperands(Copy, Part);

    // Add operand indicating the part to generate code for, to recipes still
    // requiring it.
    if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,
            VPVectorPointerRecipe>(Copy) ||
        match(Copy,
              m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
                  m_VPValue())))
      Copy->addOperand(getConstantVPV(Part));

    if (isa<VPVectorPointerRecipe>(R))
      Copy->setOperand(0, R.getOperand(0));
  }
}

using namespace llvm::VPlanPatternMatch;
void UnrollState::unrollBlock(VPBlockBase *VPB) {
  auto *VPR = dyn_cast<VPRegionBlock>(VPB);
  if (VPR) {
    if (VPR->isReplicator())
      return unrollReplicateRegionByUF(VPR);

    // Traverse blocks in region in RPO to ensure defs are visited before uses
    // across blocks.
    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
        RPOT(VPR->getEntry());
    for (VPBlockBase *VPB : RPOT)
      unrollBlock(VPB);
    return;
  }

  // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
  auto *VPBB = cast<VPBasicBlock>(VPB);
  auto InsertPtForPhi = VPBB->getFirstNonPhi();
  for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
    if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
      continue;

    // Add all VPValues for all parts to ComputeReductionResult which combines
    // the parts to compute the final reduction value.
    VPValue *Op1;
    if (match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
                      m_VPValue(), m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPInstruction>(&R));
      for (unsigned Part = 1; Part != UF; ++Part)
        R.addOperand(getValueForPart(Op1, Part));
      continue;
    }
    VPValue *Op0;
    if (match(&R, m_VPInstruction<VPInstruction::ExtractFromEnd>(
                      m_VPValue(Op0), m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
      if (Plan.hasScalarVFOnly()) {
        // Extracting from end with VF = 1 implies retrieving the scalar part
        // UF - Op1.
        unsigned Offset =
            cast<ConstantInt>(Op1->getLiveInIRValue())->getZExtValue();
        R.getVPSingleValue()->replaceAllUsesWith(
            getValueForPart(Op0, UF - Offset));
        R.eraseFromParent();
      } else {
        // Otherwise we extract from the last part.
        remapOperands(&R, UF - 1);
      }
      continue;
    }

    auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
    if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
      addUniformForAllParts(SingleDef);
      continue;
    }

    if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
      unrollHeaderPHIByUF(H, InsertPtForPhi);
      continue;
    }

    unrollRecipeByUF(R);
  }
}

void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
  assert(UF > 0 && "Unroll factor must be positive");
  Plan.setUF(UF);
  auto Cleanup = make_scope_exit([&Plan]() {
    auto Iter = vp_depth_first_deep(Plan.getEntry());
    // Remove recipes that are redundant after unrolling.
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
      for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
        auto *VPI = dyn_cast<VPInstruction>(&R);
        if (VPI &&
            VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
            VPI->getNumOperands() == 1) {
          VPI->replaceAllUsesWith(VPI->getOperand(0));
          VPI->eraseFromParent();
        }
      }
    }
  });
  if (UF == 1) {
    return;
  }

  UnrollState Unroller(Plan, UF, Ctx);

  Unroller.unrollBlock(Plan.getPreheader());

  // Iterate over all blocks in the plan starting from Entry, and unroll
  // recipes inside them. This includes the vector preheader and middle blocks,
  // which may set up or post-process per-part values.
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  for (VPBlockBase *VPB : RPOT)
    Unroller.unrollBlock(VPB);

  unsigned Part = 1;
  // Remap operands of cloned header phis to update backedge values. The header
  // phis cloned during unrolling are just after the header phi for part 0.
  // Reset Part to 1 when reaching the next part-0 header phi.
  for (VPRecipeBase &H :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    // The second operand of first-order recurrence phis, feeding the spliced
    // value across the backedge, needs to be remapped to the last part of the
    // spliced value.
    if (isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {
      Unroller.remapOperand(&H, 1, UF - 1);
      continue;
    }
    if (Unroller.contains(H.getVPSingleValue()) ||
        isa<VPWidenPointerInductionRecipe>(&H)) {
      Part = 1;
      continue;
    }
    Unroller.remapOperands(&H, Part);
    Part++;
  }

  // Remap the operand of live-outs to the last part.
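  // (E.g., with UF = 2 each live-out reads the value produced by part 1.)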
  for (const auto &[_, LO] : Plan.getLiveOuts()) {
    VPValue *In = Unroller.getValueForPart(LO->getOperand(0), UF - 1);
    LO->setOperand(0, In);
  }

  VPlanTransforms::removeDeadRecipes(Plan);
}