//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements a set of utility VPlan to VPlan transformations.
///
//===----------------------------------------------------------------------===//

#include "VPlanTransforms.h"
#include "VPRecipeBuilder.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

void VPlanTransforms::VPInstructionsToVPRecipes(
    VPlanPtr &Plan,
    function_ref<const InductionDescriptor *(PHINode *)>
        GetIntOrFpInductionDescriptor,
    ScalarEvolution &SE, const TargetLibraryInfo &TLI) {

  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    VPRecipeBase *Term = VPBB->getTerminator();
    auto EndIter = Term ? Term->getIterator() : VPBB->end();
    // Introduce each ingredient into VPlan.
    for (VPRecipeBase &Ingredient :
         make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
      VPValue *VPV = Ingredient.getVPSingleValue();
      Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());

      VPRecipeBase *NewRecipe = nullptr;
      if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) {
        auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
        if (const auto *II = GetIntOrFpInductionDescriptor(Phi)) {
          VPValue *Start = Plan->getVPValueOrAddLiveIn(II->getStartValue());
          VPValue *Step =
              vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE);
          NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II);
        } else {
          Plan->addVPValue(Phi, VPPhi);
          continue;
        }
      } else {
        assert(isa<VPInstruction>(&Ingredient) &&
               "only VPInstructions expected here");
        assert(!isa<PHINode>(Inst) && "phis should be handled above");
        // Create VPWidenMemoryInstructionRecipe for loads and stores.
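        // The memory recipes are created without a mask (nullptr) and with
        // conservative Consecutive/Reverse flags (false); no predication or
        // access-pattern information is available at this stage of
        // construction.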
        if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
          NewRecipe = new VPWidenMemoryInstructionRecipe(
              *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
              false /*Consecutive*/, false /*Reverse*/);
        } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
          NewRecipe = new VPWidenMemoryInstructionRecipe(
              *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
              nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/);
        } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
          NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
        } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
          NewRecipe =
              new VPWidenCallRecipe(*CI, drop_end(Ingredient.operands()),
                                    getVectorIntrinsicIDForCall(CI, &TLI));
        } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
          NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
        } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
          NewRecipe = new VPWidenCastRecipe(
              CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
        } else {
          NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
        }
      }

      NewRecipe->insertBefore(&Ingredient);
      if (NewRecipe->getNumDefinedValues() == 1)
        VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
      else
        assert(NewRecipe->getNumDefinedValues() == 0 &&
               "Only recipes with zero or one defined values expected");
      Ingredient.eraseFromParent();
    }
  }
}

static bool sinkScalarOperands(VPlan &Plan) {
  auto Iter = vp_depth_first_deep(Plan.getEntry());
  bool Changed = false;
  // First, collect the operands of all recipes in replicate blocks as seeds
  // for sinking.
  SetVector<std::pair<VPBasicBlock *, VPRecipeBase *>> WorkList;
  for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
    VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
    if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
      continue;
    VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);
    if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
      continue;
    for (auto &Recipe : *VPBB) {
      for (VPValue *Op : Recipe.operands())
        if (auto *Def = Op->getDefiningRecipe())
          WorkList.insert(std::make_pair(VPBB, Def));
    }
  }

  bool ScalarVFOnly = Plan.hasScalarVFOnly();
  // Try to sink each replicate or scalar IV steps recipe in the worklist.
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    auto [SinkTo, SinkCandidate] = WorkList[I];
    if (SinkCandidate->getParent() == SinkTo ||
        SinkCandidate->mayHaveSideEffects() ||
        SinkCandidate->mayReadOrWriteMemory())
      continue;
    if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
      if (!ScalarVFOnly && RepR->isUniform())
        continue;
    } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
      continue;

    bool NeedsDuplicating = false;
    // All recipe users of the sink candidate must be in the same block SinkTo
    // or all users outside of SinkTo must be uniform-after-vectorization
    // (i.e., only the first lane is used). In the latter case, we need to
    // duplicate SinkCandidate.
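    // For example (illustrative): if the candidate is used per-lane inside
    // SinkTo but only via its first lane elsewhere, a uniform clone is left
    // at the original position for the outside users, while the original
    // recipe is sunk into SinkTo.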
    auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
                            SinkCandidate](VPUser *U) {
      auto *UI = dyn_cast<VPRecipeBase>(U);
      if (!UI)
        return false;
      if (UI->getParent() == SinkTo)
        return true;
      NeedsDuplicating =
          UI->onlyFirstLaneUsed(SinkCandidate->getVPSingleValue());
      // We only know how to duplicate VPReplicateRecipes for now.
      return NeedsDuplicating && isa<VPReplicateRecipe>(SinkCandidate);
    };
    if (!all_of(SinkCandidate->getVPSingleValue()->users(), CanSinkWithUser))
      continue;

    if (NeedsDuplicating) {
      if (ScalarVFOnly)
        continue;
      Instruction *I = cast<Instruction>(
          cast<VPReplicateRecipe>(SinkCandidate)->getUnderlyingValue());
      auto *Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true);
      // TODO: add ".cloned" suffix to name of Clone's VPValue.

      Clone->insertBefore(SinkCandidate);
      SinkCandidate->getVPSingleValue()->replaceUsesWithIf(
          Clone, [SinkTo](VPUser &U, unsigned) {
            return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
          });
    }
    SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
    for (VPValue *Op : SinkCandidate->operands())
      if (auto *Def = Op->getDefiningRecipe())
        WorkList.insert(std::make_pair(SinkTo, Def));
    Changed = true;
  }
  return Changed;
}

/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
/// the mask.
static VPValue *getPredicatedMask(VPRegionBlock *R) {
  auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
  if (!EntryBB || EntryBB->size() != 1 ||
      !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
    return nullptr;

  return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
}

/// If \p R is a triangle region, return the 'then' block of the triangle.
static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
  auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
  if (EntryBB->getNumSuccessors() != 2)
    return nullptr;

  auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
  auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
  if (!Succ0 || !Succ1)
    return nullptr;

  if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
    return nullptr;
  if (Succ0->getSingleSuccessor() == Succ1)
    return Succ0;
  if (Succ1->getSingleSuccessor() == Succ0)
    return Succ1;
  return nullptr;
}

// Merge replicate regions in their successor region, if a replicate region
// is connected to a successor replicate region with the same predicate by a
// single, empty VPBasicBlock.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
  SetVector<VPRegionBlock *> DeletedRegions;

  // Collect, up front, replicate regions that are followed by an empty block
  // and then by another replicate region with a matching mask. This avoids
  // iterator invalidation issues while merging regions.
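  // Conceptually, the pattern matched and rewritten here is (sketch):
  //
  //   region1 (mask %m) -> empty VPBB -> region2 (mask %m)
  //
  // and region1's recipes are folded into region2.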
  SmallVector<VPRegionBlock *, 8> WorkList;
  for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    if (!Region1->isReplicator())
      continue;
    auto *MiddleBasicBlock =
        dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
    if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
      continue;

    auto *Region2 = dyn_cast_or_null<VPRegionBlock>(
        MiddleBasicBlock->getSingleSuccessor());
    if (!Region2 || !Region2->isReplicator())
      continue;

    VPValue *Mask1 = getPredicatedMask(Region1);
    VPValue *Mask2 = getPredicatedMask(Region2);
    if (!Mask1 || Mask1 != Mask2)
      continue;

    assert(Mask1 && Mask2 && "both regions must have masks");
    WorkList.push_back(Region1);
  }

  // Move recipes from Region1 to its successor region, if both are triangles.
  for (VPRegionBlock *Region1 : WorkList) {
    if (DeletedRegions.contains(Region1))
      continue;
    auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
    auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());

    VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
    VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
    if (!Then1 || !Then2)
      continue;

    // Note: No fusion-preventing memory dependencies are expected in either
    // region. Such dependencies should be rejected during earlier dependence
    // checks, which guarantee accesses can be re-ordered for vectorization.
    //
    // Move recipes to the successor region.
    for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
      ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());

    auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
    auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());

    // Move VPPredInstPHIRecipes from the merge block to the successor region's
    // merge block. Update all users inside the successor region to use the
    // original values.
    for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
      VPValue *PredInst1 =
          cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
      VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
      Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
        auto *UI = dyn_cast<VPRecipeBase>(&U);
        return UI && UI->getParent() == Then2;
      });

      Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
    }

    // Finally, remove the first region.
    for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
      VPBlockUtils::disconnectBlocks(Pred, Region1);
      VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
    }
    VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
    DeletedRegions.insert(Region1);
  }

  for (VPRegionBlock *ToDelete : DeletedRegions)
    delete ToDelete;
  return !DeletedRegions.empty();
}

static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
                                            VPlan &Plan) {
  Instruction *Instr = PredRecipe->getUnderlyingInstr();
  // Build the triangular if-then region.
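  // The region built below has the following shape, with block names derived
  // from the instruction's opcode (illustrative):
  //
  //   pred.<opcode>.entry:    branch-on-mask %BlockInMask
  //   pred.<opcode>.if:       the replicate recipe, without its mask operand
  //   pred.<opcode>.continue: optional pred-inst-phi merging the result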
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BlockInMask = PredRecipe->getMask();
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);

  // Replace predicated replicate recipe with a replicate recipe without a
  // mask but in the replicate region.
  auto *RecipeWithoutMask = new VPReplicateRecipe(
      PredRecipe->getUnderlyingInstr(),
      make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
      PredRecipe->isUniform());
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);

  VPPredInstPHIRecipe *PHIRecipe = nullptr;
  if (PredRecipe->getNumUsers() != 0) {
    PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask);
    PredRecipe->replaceAllUsesWith(PHIRecipe);
    PHIRecipe->setOperand(0, RecipeWithoutMask);
  }
  PredRecipe->eraseFromParent();
  auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
  VPBlockUtils::connectBlocks(Pred, Exiting);

  return Region;
}

static void addReplicateRegions(VPlan &Plan) {
  SmallVector<VPReplicateRecipe *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    for (VPRecipeBase &R : *VPBB)
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isPredicated())
          WorkList.push_back(RepR);
      }
  }

  unsigned BBNum = 0;
  for (VPReplicateRecipe *RepR : WorkList) {
    VPBasicBlock *CurrentBlock = RepR->getParent();
    VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());

    BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
    SplitBlock->setName(
        OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
    // Record predicated instructions for above packing optimizations.
    VPBlockBase *Region = createReplicateRegion(RepR, Plan);
    Region->setParent(CurrentBlock->getParent());
    VPBlockUtils::disconnectBlocks(CurrentBlock, SplitBlock);
    VPBlockUtils::connectBlocks(CurrentBlock, Region);
    VPBlockUtils::connectBlocks(Region, SplitBlock);
  }
}

void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
  // Convert masked VPReplicateRecipes to if-then region blocks.
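  // Afterwards, iterate to a fixed point: sinking scalar operands can enable
  // merging replicate regions, and merging regions and blocks can in turn
  // expose further sinking opportunities.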
  addReplicateRegions(Plan);

  bool ShouldSimplify = true;
  while (ShouldSimplify) {
    ShouldSimplify = sinkScalarOperands(Plan);
    ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
    ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(Plan);
  }
}

bool VPlanTransforms::mergeBlocksIntoPredecessors(VPlan &Plan) {
  SmallVector<VPBasicBlock *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    auto *PredVPBB =
        dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
    if (PredVPBB && PredVPBB->getNumSuccessors() == 1)
      WorkList.push_back(VPBB);
  }

  for (VPBasicBlock *VPBB : WorkList) {
    VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
      R.moveBefore(*PredVPBB, PredVPBB->end());
    VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
    auto *ParentRegion = cast_or_null<VPRegionBlock>(VPBB->getParent());
    if (ParentRegion && ParentRegion->getExiting() == VPBB)
      ParentRegion->setExiting(PredVPBB);
    for (auto *Succ : to_vector(VPBB->successors())) {
      VPBlockUtils::disconnectBlocks(VPBB, Succ);
      VPBlockUtils::connectBlocks(PredVPBB, Succ);
    }
    delete VPBB;
  }
  return !WorkList.empty();
}

void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) {
  for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (!IV || IV->getTruncInst())
      continue;

    // A sequence of IR Casts has potentially been recorded for IV, which
    // *must be bypassed* when the IV is vectorized, because the vectorized IV
    // will produce the desired casted value. This sequence forms a def-use
    // chain and is provided in reverse order, ending with the cast that uses
    // the IV phi. Search for the recipe of the last cast in the chain and
    // replace it with the original IV. Note that only the final cast is
    // expected to have users outside the cast-chain and the dead casts left
    // over will be cleaned up later.
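    // E.g. (illustrative), for %iv = phi ..., %c1 = trunc %iv, %c2 = sext %c1
    // with recorded chain [%c2, %c1], the walk below starts at the IV, finds
    // the recipe for %c1, then the one for %c2, and finally redirects %c2's
    // users to the widened IV.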
    auto &Casts = IV->getInductionDescriptor().getCastInsts();
    VPValue *FindMyCast = IV;
    for (Instruction *IRCast : reverse(Casts)) {
      VPRecipeBase *FoundUserCast = nullptr;
      for (auto *U : FindMyCast->users()) {
        auto *UserCast = cast<VPRecipeBase>(U);
        if (UserCast->getNumDefinedValues() == 1 &&
            UserCast->getVPSingleValue()->getUnderlyingValue() == IRCast) {
          FoundUserCast = UserCast;
          break;
        }
      }
      assert(FoundUserCast &&
             "missing recipe for a cast in the recorded chain");
      FindMyCast = FoundUserCast->getVPSingleValue();
    }
    FindMyCast->replaceAllUsesWith(IV);
  }
}

void VPlanTransforms::removeRedundantCanonicalIVs(VPlan &Plan) {
  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
  for (VPUser *U : CanonicalIV->users()) {
    WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
    if (WidenNewIV)
      break;
  }

  if (!WidenNewIV)
    return;

  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);

    if (!WidenOriginalIV || !WidenOriginalIV->isCanonical() ||
        WidenOriginalIV->getScalarType() != WidenNewIV->getScalarType())
      continue;

    // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
    // everything WidenNewIV's users need. That is, WidenOriginalIV will
    // generate a vector phi or all users of WidenNewIV demand the first lane
    // only.
    if (any_of(WidenOriginalIV->users(),
               [WidenOriginalIV](VPUser *U) {
                 return !U->usesScalars(WidenOriginalIV);
               }) ||
        vputils::onlyFirstLaneUsed(WidenNewIV)) {
      WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
      WidenNewIV->eraseFromParent();
      return;
    }
  }
}

void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());

  for (VPBasicBlock *VPBB :
       reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
    // The recipes in the block are processed in reverse order, to catch chains
    // of dead recipes.
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      // A user keeps R alive:
      if (any_of(R.definedValues(),
                 [](VPValue *V) { return V->getNumUsers(); }))
        continue;

      // Having side effects keeps R alive, but do remove conditional assume
      // instructions as their conditions may be flattened.
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      bool IsConditionalAssume =
          RepR && RepR->isPredicated() &&
          match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>());
      if (R.mayHaveSideEffects() && !IsConditionalAssume)
        continue;

      R.eraseFromParent();
    }
  }
}

static VPValue *createScalarIVSteps(VPlan &Plan, const InductionDescriptor &ID,
                                    ScalarEvolution &SE, Instruction *TruncI,
                                    Type *IVTy, VPValue *StartV,
                                    VPValue *Step) {
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  auto IP = HeaderVPBB->getFirstNonPhi();
  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  Type *TruncTy = TruncI ? TruncI->getType() : IVTy;
  VPValue *BaseIV = CanonicalIV;
  if (!CanonicalIV->isCanonical(ID.getKind(), StartV, Step, TruncTy)) {
    BaseIV = new VPDerivedIVRecipe(ID, StartV, CanonicalIV, Step,
                                   TruncI ? TruncI->getType() : nullptr);
    HeaderVPBB->insert(BaseIV->getDefiningRecipe(), IP);
  }

  VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(ID, BaseIV, Step);
  HeaderVPBB->insert(Steps, IP);
  return Steps;
}

void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  bool HasOnlyVectorVFs = !Plan.hasVF(ElementCount::getFixed(1));
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (!WideIV)
      continue;
    if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
          return U->usesScalars(WideIV);
        }))
      continue;

    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPValue *Steps = createScalarIVSteps(
        Plan, ID, SE, WideIV->getTruncInst(), WideIV->getPHINode()->getType(),
        WideIV->getStartValue(), WideIV->getStepValue());

    // Update scalar users of the IV to use Steps instead.
    if (!HasOnlyVectorVFs)
      WideIV->replaceAllUsesWith(Steps);
    else
      WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
        return U.usesScalars(WideIV);
      });
  }
}

void VPlanTransforms::removeRedundantExpandSCEVRecipes(VPlan &Plan) {
  DenseMap<const SCEV *, VPValue *> SCEV2VPV;

  for (VPRecipeBase &R :
       make_early_inc_range(*Plan.getEntry()->getEntryBasicBlock())) {
    auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
    if (!ExpR)
      continue;

    auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR});
    if (I.second)
      continue;
    ExpR->replaceAllUsesWith(I.first->second);
    ExpR->eraseFromParent();
  }
}

static bool canSimplifyBranchOnCond(VPInstruction *Term) {
  VPInstruction *Not = dyn_cast<VPInstruction>(Term->getOperand(0));
  if (!Not || Not->getOpcode() != VPInstruction::Not)
    return false;

  VPInstruction *ALM = dyn_cast<VPInstruction>(Not->getOperand(0));
  return ALM && ALM->getOpcode() == VPInstruction::ActiveLaneMask;
}

void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                         unsigned BestUF,
                                         PredicatedScalarEvolution &PSE) {
  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
  assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
  VPBasicBlock *ExitingVPBB =
      Plan.getVectorLoopRegion()->getExitingBasicBlock();
  auto *Term = dyn_cast<VPInstruction>(&ExitingVPBB->back());
  // Try to simplify the branch condition if TC <= VF * UF when preparing to
  // execute the plan for the main vector loop. We only do this if the
  // terminator is:
  //  1. BranchOnCount, or
  //  2. BranchOnCond where the input is Not(ActiveLaneMask).
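  // For example (illustrative), with BestVF = 4, BestUF = 2 and a trip count
  // known to be at most 8, the vector loop body executes exactly once, so the
  // conditional terminator can be folded to a branch-on-cond on constant
  // true.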
  if (!Term || (Term->getOpcode() != VPInstruction::BranchOnCount &&
                (Term->getOpcode() != VPInstruction::BranchOnCond ||
                 !canSimplifyBranchOnCond(Term))))
    return;

  Type *IdxTy =
      Plan.getCanonicalIV()->getStartValue()->getLiveInIRValue()->getType();
  const SCEV *TripCount = createTripCountSCEV(IdxTy, PSE);
  ScalarEvolution &SE = *PSE.getSE();
  const SCEV *C =
      SE.getConstant(TripCount->getType(), BestVF.getKnownMinValue() * BestUF);
  if (TripCount->isZero() ||
      !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
    return;

  LLVMContext &Ctx = SE.getContext();
  auto *BOC = new VPInstruction(
      VPInstruction::BranchOnCond,
      {Plan.getVPValueOrAddLiveIn(ConstantInt::getTrue(Ctx))});
  Term->eraseFromParent();
  ExitingVPBB->appendRecipe(BOC);
  Plan.setVF(BestVF);
  Plan.setUF(BestUF);
  // TODO: Further simplifications are possible:
  //  1. Replace inductions with constants.
  //  2. Replace vector loop region with VPBasicBlock.
}

#ifndef NDEBUG
static VPRegionBlock *GetReplicateRegion(VPRecipeBase *R) {
  auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
  if (Region && Region->isReplicator()) {
    assert(Region->getNumSuccessors() == 1 &&
           Region->getNumPredecessors() == 1 && "Expected SESE region!");
    assert(R->getParent()->size() == 1 &&
           "A recipe in an original replicator region must be the only "
           "recipe in its block");
    return Region;
  }
  return nullptr;
}
#endif

static bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B,
                              VPDominatorTree &VPDT) {
  if (A == B)
    return false;

  auto LocalComesBefore = [](const VPRecipeBase *A, const VPRecipeBase *B) {
    for (auto &R : *A->getParent()) {
      if (&R == A)
        return true;
      if (&R == B)
        return false;
    }
    llvm_unreachable("recipe not found");
  };
  const VPBlockBase *ParentA = A->getParent();
  const VPBlockBase *ParentB = B->getParent();
  if (ParentA == ParentB)
    return LocalComesBefore(A, B);

  assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(A)) &&
         "No replicate regions expected at this point");
  assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(B)) &&
         "No replicate regions expected at this point");
  return VPDT.properlyDominates(ParentA, ParentB);
}

/// Sink users of \p FOR after the recipe defining the previous value \p
/// Previous of the recurrence. \returns true if all users of \p FOR could be
/// re-arranged as needed or false if it is not possible.
static bool
sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
                                 VPRecipeBase *Previous,
                                 VPDominatorTree &VPDT) {
  // Collect recipes that need sinking.
  SmallVector<VPRecipeBase *> WorkList;
  SmallPtrSet<VPRecipeBase *, 8> Seen;
  Seen.insert(Previous);
  auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
    // The previous value must not depend on the users of the recurrence phi;
    // otherwise, FOR would not be a fixed-order recurrence.
    if (SinkCandidate == Previous)
      return false;

    if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
        !Seen.insert(SinkCandidate).second ||
        properlyDominates(Previous, SinkCandidate, VPDT))
      return true;

    if (SinkCandidate->mayHaveSideEffects())
      return false;

    WorkList.push_back(SinkCandidate);
    return true;
  };

  // Recursively sink users of FOR after Previous.
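  // The worklist is processed transitively: the users of each candidate may
  // themselves need sinking and are appended for later processing.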
  WorkList.push_back(FOR);
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    VPRecipeBase *Current = WorkList[I];
    assert(Current->getNumDefinedValues() == 1 &&
           "only recipes with a single defined value expected");

    for (VPUser *User : Current->getVPSingleValue()->users()) {
      if (auto *R = dyn_cast<VPRecipeBase>(User))
        if (!TryToPushSinkCandidate(R))
          return false;
    }
  }

  // Keep recipes to sink ordered by dominance so earlier instructions are
  // processed first.
  sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
    return properlyDominates(A, B, VPDT);
  });

  for (VPRecipeBase *SinkCandidate : WorkList) {
    if (SinkCandidate == FOR)
      continue;

    SinkCandidate->moveAfter(Previous);
    Previous = SinkCandidate;
  }
  return true;
}

bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
                                                  VPBuilder &Builder) {
  VPDominatorTree VPDT;
  VPDT.recalculate(Plan);

  SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
  for (VPRecipeBase &R :
       Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis())
    if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
      RecurrencePhis.push_back(FOR);

  for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
    SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
    VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
    // Fixed-order recurrences do not contain cycles, so this loop is
    // guaranteed to terminate.
    while (auto *PrevPhi =
               dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Previous)) {
      assert(PrevPhi->getParent() == FOR->getParent());
      assert(SeenPhis.insert(PrevPhi).second);
      Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
    }

    if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT))
      return false;

    // Introduce a recipe to combine the incoming and previous values of a
    // fixed-order recurrence.
    VPBasicBlock *InsertBlock = Previous->getParent();
    if (isa<VPHeaderPHIRecipe>(Previous))
      Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
    else
      Builder.setInsertPoint(InsertBlock, std::next(Previous->getIterator()));

    auto *RecurSplice = cast<VPInstruction>(
        Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
                             {FOR, FOR->getBackedgeValue()}));

    FOR->replaceAllUsesWith(RecurSplice);
    // Set the first operand of RecurSplice to FOR again, after replacing
    // all users.
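    // (replaceAllUsesWith above also replaced FOR in RecurSplice itself,
    // making its first operand refer to RecurSplice; it must be reset to the
    // recurrence phi.)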
    RecurSplice->setOperand(0, FOR);
  }
  return true;
}

void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
  for (VPRecipeBase &R :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
    if (!PhiR)
      continue;
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
    RecurKind RK = RdxDesc.getRecurrenceKind();
    if (RK != RecurKind::Add && RK != RecurKind::Mul)
      continue;

    SmallSetVector<VPValue *, 8> Worklist;
    Worklist.insert(PhiR);

    for (unsigned I = 0; I != Worklist.size(); ++I) {
      VPValue *Cur = Worklist[I];
      if (auto *RecWithFlags =
              dyn_cast<VPRecipeWithIRFlags>(Cur->getDefiningRecipe())) {
        RecWithFlags->dropPoisonGeneratingFlags();
      }

      for (VPUser *U : Cur->users()) {
        auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
        if (!UserRecipe)
          continue;
        for (VPValue *V : UserRecipe->definedValues())
          Worklist.insert(V);
      }
    }
  }
}

/// Returns true if \p V is a constant one.
static bool isConstantOne(VPValue *V) {
  if (!V->isLiveIn())
    return false;
  auto *C = dyn_cast<ConstantInt>(V->getLiveInIRValue());
  return C && C->isOne();
}

/// Returns the llvm::Instruction opcode for \p R, or 0 if \p R is not one of
/// the handled recipe kinds.
static unsigned getOpcodeForRecipe(VPRecipeBase &R) {
  if (auto *WidenR = dyn_cast<VPWidenRecipe>(&R))
    return WidenR->getUnderlyingInstr()->getOpcode();
  if (auto *WidenC = dyn_cast<VPWidenCastRecipe>(&R))
    return WidenC->getOpcode();
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R))
    return RepR->getUnderlyingInstr()->getOpcode();
  if (auto *VPI = dyn_cast<VPInstruction>(&R))
    return VPI->getOpcode();
  return 0;
}

/// Try to simplify recipe \p R.
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
  switch (getOpcodeForRecipe(R)) {
  case Instruction::Mul: {
    // Simplify mul by 1 to the other operand.
    VPValue *A = R.getOperand(0);
    VPValue *B = R.getOperand(1);
    if (isConstantOne(A))
      return R.getVPSingleValue()->replaceAllUsesWith(B);
    if (isConstantOne(B))
      return R.getVPSingleValue()->replaceAllUsesWith(A);
    break;
  }
  case Instruction::Trunc: {
    // Fold trunc (ext A) by either dropping redundant casts or replacing the
    // pair with a single, narrower cast.
    VPRecipeBase *Ext = R.getOperand(0)->getDefiningRecipe();
    if (!Ext)
      break;
    unsigned ExtOpcode = getOpcodeForRecipe(*Ext);
    if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt)
      break;
    VPValue *A = Ext->getOperand(0);
    VPValue *Trunc = R.getVPSingleValue();
    Type *TruncTy = TypeInfo.inferScalarType(Trunc);
    Type *ATy = TypeInfo.inferScalarType(A);
    if (TruncTy == ATy) {
      Trunc->replaceAllUsesWith(A);
    } else if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
      auto *VPC =
          new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
      VPC->insertBefore(&R);
      Trunc->replaceAllUsesWith(VPC);
    } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
      auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
      VPC->insertBefore(&R);
      Trunc->replaceAllUsesWith(VPC);
    }
#ifndef NDEBUG
    // Verify that the cached type info for both A and its users is still
    // accurate by comparing it to freshly computed types.
    VPTypeAnalysis TypeInfo2(TypeInfo.getContext());
    assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
    for (VPUser *U : A->users()) {
      auto *R = dyn_cast<VPRecipeBase>(U);
      if (!R)
        continue;
      for (VPValue *VPV : R->definedValues())
        assert(TypeInfo.inferScalarType(VPV) ==
               TypeInfo2.inferScalarType(VPV));
    }
#endif
    break;
  }
  default:
    break;
  }
}

/// Try to simplify the recipes in \p Plan.
static void simplifyRecipes(VPlan &Plan, LLVMContext &Ctx) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  VPTypeAnalysis TypeInfo(Ctx);
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      simplifyRecipe(R, TypeInfo);
    }
  }
}

void VPlanTransforms::truncateToMinimalBitwidths(
    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs,
    LLVMContext &Ctx) {
#ifndef NDEBUG
  // Count the processed recipes and cross check the count later with MinBWs
  // size, to make sure all entries in MinBWs have been handled.
  unsigned NumProcessedRecipes = 0;
#endif
  // Keep track of created truncates, so they can be re-used. Note that we
  // cannot use RAUW after creating a new truncate, as this could make other
  // uses have different types for their operands, making them invalidly
  // typed.
  DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
  VPTypeAnalysis TypeInfo(Ctx);
  VPBasicBlock *PH = Plan.getEntry();
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
               VPWidenSelectRecipe>(&R))
        continue;

      VPValue *ResultVPV = R.getVPSingleValue();
      auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
      unsigned NewResSizeInBits = MinBWs.lookup(UI);
      if (!NewResSizeInBits)
        continue;

#ifndef NDEBUG
      NumProcessedRecipes++;
#endif
      // If the value wasn't vectorized, we must maintain the original scalar
      // type. Skip those here, after incrementing NumProcessedRecipes. Also
      // skip casts, which do not need to be handled explicitly here, as
      // redundant casts will be removed during recipe simplification.
      if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R)) {
#ifndef NDEBUG
        // If any of the operands is a live-in that is not used by a
        // VPWidenRecipe or VPWidenSelectRecipe but is in MinBWs, make sure it
        // is counted as processed as well. When MinBWs is constructed, there
        // is no information about whether recipes are widened or replicated,
        // and in case they are replicated the operands are not truncated.
        // Counting them here ensures we do not miss any recipes in MinBWs.
        // TODO: Remove once the analysis is done on VPlan.
        for (VPValue *Op : R.operands()) {
          if (!Op->isLiveIn())
            continue;
          auto *UV = dyn_cast_or_null<Instruction>(Op->getUnderlyingValue());
          if (UV && MinBWs.contains(UV) && !ProcessedTruncs.contains(Op) &&
              all_of(Op->users(), [](VPUser *U) {
                return !isa<VPWidenRecipe, VPWidenSelectRecipe>(U);
              })) {
            // Add an entry to ProcessedTruncs to avoid counting the same
            // operand multiple times.
            ProcessedTruncs[Op] = nullptr;
            NumProcessedRecipes += 1;
          }
        }
#endif
        continue;
      }

      Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
      unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
      assert(OldResTy->isIntegerTy() && "only integer types supported");
      if (OldResSizeInBits == NewResSizeInBits)
        continue;
      assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
      (void)OldResSizeInBits;

      auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);

      // Shrink operands by introducing truncates as needed.
      unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
      for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
        auto *Op = R.getOperand(Idx);
        unsigned OpSizeInBits =
            TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
        if (OpSizeInBits == NewResSizeInBits)
          continue;
        assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
        auto [ProcessedIter, IterIsEmpty] =
            ProcessedTruncs.insert({Op, nullptr});
        VPWidenCastRecipe *NewOp =
            IterIsEmpty
                ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
                : ProcessedIter->second;
        R.setOperand(Idx, NewOp);
        if (!IterIsEmpty)
          continue;
        ProcessedIter->second = NewOp;
        if (!Op->isLiveIn()) {
          NewOp->insertBefore(&R);
        } else {
          PH->appendRecipe(NewOp);
#ifndef NDEBUG
          auto *OpInst = dyn_cast<Instruction>(Op->getLiveInIRValue());
          bool IsContained = MinBWs.contains(OpInst);
          NumProcessedRecipes += IsContained;
#endif
        }
      }

      // Any wrapping introduced by shrinking this operation shouldn't be
      // considered undefined behavior. So, we can't unconditionally copy
      // arithmetic wrapping flags to VPW.
      if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
        VPW->dropPoisonGeneratingFlags();

      // Extend the result back to the original width. replaceAllUsesWith
      // below also replaces Ext's own operand, so it must be reset to the
      // shrunken result afterwards.
      auto *Ext = new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
      Ext->insertAfter(&R);
      ResultVPV->replaceAllUsesWith(Ext);
      Ext->setOperand(0, ResultVPV);
    }
  }

  assert(MinBWs.size() == NumProcessedRecipes &&
         "some entries in MinBWs haven't been processed");
}

void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
  removeRedundantCanonicalIVs(Plan);
  removeRedundantInductionCasts(Plan);

  optimizeInductions(Plan, SE);
  simplifyRecipes(Plan, SE.getContext());
  removeDeadRecipes(Plan);

  createAndOptimizeReplicateRegions(Plan);

  removeRedundantExpandSCEVRecipes(Plan);
  mergeBlocksIntoPredecessors(Plan);
}

// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
// the loop terminator with a branch-on-cond recipe with the negated
// active-lane-mask as operand. Note that this turns the loop into an
// uncountable one. Only the existing terminator is replaced; all other
// existing recipes/users remain unchanged, except for poison-generating flags
// being dropped from the canonical IV increment. Return the created
// VPActiveLaneMaskPHIRecipe.
//
// The function uses the following definitions:
//
//  %TripCount = DataAndControlFlowWithoutRuntimeCheck ?
//    calculate-trip-count-minus-VF (original TC) : original TC
//  %IncrementValue = DataAndControlFlowWithoutRuntimeCheck ?
//    CanonicalIVPhi : CanonicalIVIncrement
//  %StartV is the canonical induction start value.
//
// The function adds the following recipes:
//
// vector.ph:
//   %TripCount = calculate-trip-count-minus-VF (original TC)
//       [if DataAndControlFlowWithoutRuntimeCheck]
//   %EntryInc = canonical-iv-increment-for-part %StartV
//   %EntryALM = active-lane-mask %EntryInc, %TripCount
//
// vector.body:
//   ...
//   %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
//   ...
//   %InLoopInc = canonical-iv-increment-for-part %IncrementValue
//   %ALM = active-lane-mask %InLoopInc, %TripCount
//   %Negated = Not %ALM
//   branch-on-cond %Negated
//
static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
  auto *CanonicalIVPHI = Plan.getCanonicalIV();
  VPValue *StartV = CanonicalIVPHI->getStartValue();

  auto *CanonicalIVIncrement =
      cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
  // TODO: Check if dropping the flags is needed if
  // !DataAndControlFlowWithoutRuntimeCheck.
  CanonicalIVIncrement->dropPoisonGeneratingFlags();
  DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
  // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
  // we have to take unrolling into account. Each part needs to start at
  // Part * VF.
  auto *VecPreheader = cast<VPBasicBlock>(TopRegion->getSinglePredecessor());
  VPBuilder Builder(VecPreheader);

  // Create the ActiveLaneMask instruction using the correct start values.
  VPValue *TC = Plan.getTripCount();

  VPValue *TripCount, *IncrementValue;
  if (!DataAndControlFlowWithoutRuntimeCheck) {
    // When the loop is guarded by a runtime overflow check for the loop
    // induction variable increment by VF, we can increment the value before
    // the get.active.lane.mask and use the unmodified tripcount.
    IncrementValue = CanonicalIVIncrement;
    TripCount = TC;
  } else {
    // When avoiding a runtime check, the active.lane.mask inside the loop
    // uses a modified trip count and the induction variable increment is
    // done after the active.lane.mask intrinsic is called.
    IncrementValue = CanonicalIVPHI;
    TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
                                     {TC}, DL);
  }
  auto *EntryIncrement = Builder.createOverflowingOp(
      VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
      "index.part.next");

  // Create the active lane mask instruction in the VPlan preheader.
  auto *EntryALM =
      Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
                           DL, "active.lane.mask.entry");

  // Now create the ActiveLaneMaskPhi recipe in the main loop using the
  // preheader ActiveLaneMask instruction.
  auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
  LaneMaskPhi->insertAfter(CanonicalIVPHI);

  // Create the active lane mask for the next iteration of the loop before the
  // original terminator.
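  // The mask computed here (see the sketch in the function comment) feeds
  // both the backedge of the lane-mask phi and, negated, the new
  // branch-on-cond terminator created below.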
  VPRecipeBase *OriginalTerminator = EB->getTerminator();
  Builder.setInsertPoint(OriginalTerminator);
  auto *InLoopIncrement =
      Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
                                  {IncrementValue}, {false, false}, DL);
  auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
                                   {InLoopIncrement, TripCount}, DL,
                                   "active.lane.mask.next");
  LaneMaskPhi->addOperand(ALM);

  // Replace the original terminator with BranchOnCond. We have to invert the
  // mask here because a true condition means jumping to the exit block.
  auto *NotMask = Builder.createNot(ALM, DL);
  Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
  OriginalTerminator->eraseFromParent();
  return LaneMaskPhi;
}

void VPlanTransforms::addActiveLaneMask(
    VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
    bool DataAndControlFlowWithoutRuntimeCheck) {
  assert((!DataAndControlFlowWithoutRuntimeCheck ||
          UseActiveLaneMaskForControlFlow) &&
         "DataAndControlFlowWithoutRuntimeCheck implies "
         "UseActiveLaneMaskForControlFlow");

  auto FoundWidenCanonicalIVUser =
      find_if(Plan.getCanonicalIV()->users(),
              [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
  assert(FoundWidenCanonicalIVUser !=
             Plan.getCanonicalIV()->users().end() &&
         "Must have widened canonical IV when tail folding!");
  auto *WideCanonicalIV =
      cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
  VPRecipeBase *LaneMask;
  if (UseActiveLaneMaskForControlFlow) {
    LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
        Plan, DataAndControlFlowWithoutRuntimeCheck);
  } else {
    LaneMask = new VPInstruction(VPInstruction::ActiveLaneMask,
                                 {WideCanonicalIV, Plan.getTripCount()},
                                 nullptr, "active.lane.mask");
    LaneMask->insertAfter(WideCanonicalIV);
  }

  // Walk users of WideCanonicalIV and replace all compares of the form
  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
  // active-lane-mask. Iterate over a copy of the users, as the replacement
  // below modifies the use-list.
  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
  for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) {
    auto *CompareToReplace = dyn_cast<VPInstruction>(U);
    if (!CompareToReplace ||
        CompareToReplace->getOpcode() != Instruction::ICmp ||
        CompareToReplace->getPredicate() != CmpInst::ICMP_ULE ||
        CompareToReplace->getOperand(1) != BTC)
      continue;

    assert(CompareToReplace->getOperand(0) == WideCanonicalIV &&
           "WidenCanonicalIV must be the first operand of the compare");
    CompareToReplace->replaceAllUsesWith(LaneMask->getVPSingleValue());
    CompareToReplace->eraseFromParent();
  }
}