1 //===- VPlan.cpp - Vectorizer Plan ----------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This is the LLVM vectorization plan. It represents a candidate for 11 /// vectorization, allowing to plan and optimize how to vectorize a given loop 12 /// before generating LLVM-IR. 13 /// The vectorizer uses vectorization plans to estimate the costs of potential 14 /// candidates and if profitable to execute the desired plan, generating vector 15 /// LLVM-IR code. 16 /// 17 //===----------------------------------------------------------------------===// 18 19 #include "VPlan.h" 20 #include "VPlanDominatorTree.h" 21 #include "llvm/ADT/DepthFirstIterator.h" 22 #include "llvm/ADT/PostOrderIterator.h" 23 #include "llvm/ADT/SmallVector.h" 24 #include "llvm/ADT/Twine.h" 25 #include "llvm/Analysis/LoopInfo.h" 26 #include "llvm/IR/BasicBlock.h" 27 #include "llvm/IR/CFG.h" 28 #include "llvm/IR/InstrTypes.h" 29 #include "llvm/IR/Instruction.h" 30 #include "llvm/IR/Instructions.h" 31 #include "llvm/IR/Type.h" 32 #include "llvm/IR/Value.h" 33 #include "llvm/Support/Casting.h" 34 #include "llvm/Support/CommandLine.h" 35 #include "llvm/Support/Debug.h" 36 #include "llvm/Support/ErrorHandling.h" 37 #include "llvm/Support/GenericDomTreeConstruction.h" 38 #include "llvm/Support/GraphWriter.h" 39 #include "llvm/Support/raw_ostream.h" 40 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 41 #include <cassert> 42 #include <iterator> 43 #include <string> 44 #include <vector> 45 46 using namespace llvm; 47 extern cl::opt<bool> EnableVPlanNativePath; 48 49 #define DEBUG_TYPE "vplan" 50 51 raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) { 52 const VPInstruction *Instr = dyn_cast<VPInstruction>(&V); 53 VPSlotTracker SlotTracker( 54 (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr); 55 V.print(OS, SlotTracker); 56 return OS; 57 } 58 59 void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const { 60 if (const VPInstruction *Instr = dyn_cast<VPInstruction>(this)) 61 Instr->print(OS, SlotTracker); 62 else 63 printAsOperand(OS, SlotTracker); 64 } 65 66 void VPValue::dump() const { 67 const VPInstruction *Instr = dyn_cast<VPInstruction>(this); 68 VPSlotTracker SlotTracker( 69 (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr); 70 print(dbgs(), SlotTracker); 71 dbgs() << "\n"; 72 } 73 74 void VPRecipeBase::dump() const { 75 VPSlotTracker SlotTracker(nullptr); 76 print(dbgs(), "", SlotTracker); 77 dbgs() << "\n"; 78 } 79 80 VPUser *VPRecipeBase::toVPUser() { 81 if (auto *U = dyn_cast<VPInstruction>(this)) 82 return U; 83 if (auto *U = dyn_cast<VPWidenRecipe>(this)) 84 return U; 85 if (auto *U = dyn_cast<VPWidenCallRecipe>(this)) 86 return U; 87 if (auto *U = dyn_cast<VPWidenSelectRecipe>(this)) 88 return U; 89 if (auto *U = dyn_cast<VPWidenGEPRecipe>(this)) 90 return U; 91 if (auto *U = dyn_cast<VPBlendRecipe>(this)) 92 return U; 93 if (auto *U = dyn_cast<VPInterleaveRecipe>(this)) 94 return U; 95 if (auto *U = dyn_cast<VPReplicateRecipe>(this)) 96 return U; 97 if (auto *U = dyn_cast<VPBranchOnMaskRecipe>(this)) 98 return U; 99 if (auto *U = dyn_cast<VPWidenMemoryInstructionRecipe>(this)) 100 return U; 101 return nullptr; 102 } 103 104 // Get the top-most entry block of \p Start. This is the entry block of the 105 // containing VPlan. This function is templated to support both const and non-const blocks 106 template <typename T> static T *getPlanEntry(T *Start) { 107 T *Next = Start; 108 T *Current = Start; 109 while ((Next = Next->getParent())) 110 Current = Next; 111 112 SmallSetVector<T *, 8> WorkList; 113 WorkList.insert(Current); 114 115 for (unsigned i = 0; i < WorkList.size(); i++) { 116 T *Current = WorkList[i]; 117 if (Current->getNumPredecessors() == 0) 118 return Current; 119 auto &Predecessors = Current->getPredecessors(); 120 WorkList.insert(Predecessors.begin(), Predecessors.end()); 121 } 122 123 llvm_unreachable("VPlan without any entry node without predecessors"); 124 } 125 126 VPlan *VPBlockBase::getPlan() { return getPlanEntry(this)->Plan; } 127 128 const VPlan *VPBlockBase::getPlan() const { return getPlanEntry(this)->Plan; } 129 130 /// \return the VPBasicBlock that is the entry of Block, possibly indirectly. 131 const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const { 132 const VPBlockBase *Block = this; 133 while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) 134 Block = Region->getEntry(); 135 return cast<VPBasicBlock>(Block); 136 } 137 138 VPBasicBlock *VPBlockBase::getEntryBasicBlock() { 139 VPBlockBase *Block = this; 140 while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) 141 Block = Region->getEntry(); 142 return cast<VPBasicBlock>(Block); 143 } 144 145 void VPBlockBase::setPlan(VPlan *ParentPlan) { 146 assert(ParentPlan->getEntry() == this && 147 "Can only set plan on its entry block."); 148 Plan = ParentPlan; 149 } 150 151 /// \return the VPBasicBlock that is the exit of Block, possibly indirectly. 152 const VPBasicBlock *VPBlockBase::getExitBasicBlock() const { 153 const VPBlockBase *Block = this; 154 while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) 155 Block = Region->getExit(); 156 return cast<VPBasicBlock>(Block); 157 } 158 159 VPBasicBlock *VPBlockBase::getExitBasicBlock() { 160 VPBlockBase *Block = this; 161 while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) 162 Block = Region->getExit(); 163 return cast<VPBasicBlock>(Block); 164 } 165 166 VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() { 167 if (!Successors.empty() || !Parent) 168 return this; 169 assert(Parent->getExit() == this && 170 "Block w/o successors not the exit of its parent."); 171 return Parent->getEnclosingBlockWithSuccessors(); 172 } 173 174 VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() { 175 if (!Predecessors.empty() || !Parent) 176 return this; 177 assert(Parent->getEntry() == this && 178 "Block w/o predecessors not the entry of its parent."); 179 return Parent->getEnclosingBlockWithPredecessors(); 180 } 181 182 void VPBlockBase::deleteCFG(VPBlockBase *Entry) { 183 SmallVector<VPBlockBase *, 8> Blocks; 184 for (VPBlockBase *Block : depth_first(Entry)) 185 Blocks.push_back(Block); 186 187 for (VPBlockBase *Block : Blocks) 188 delete Block; 189 } 190 191 BasicBlock * 192 VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { 193 // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks. 194 // Pred stands for Predessor. Prev stands for Previous - last visited/created. 195 BasicBlock *PrevBB = CFG.PrevBB; 196 BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(), 197 PrevBB->getParent(), CFG.LastBB); 198 LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n'); 199 200 // Hook up the new basic block to its predecessors. 201 for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) { 202 VPBasicBlock *PredVPBB = PredVPBlock->getExitBasicBlock(); 203 auto &PredVPSuccessors = PredVPBB->getSuccessors(); 204 BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB]; 205 206 // In outer loop vectorization scenario, the predecessor BBlock may not yet 207 // be visited(backedge). Mark the VPBasicBlock for fixup at the end of 208 // vectorization. We do not encounter this case in inner loop vectorization 209 // as we start out by building a loop skeleton with the vector loop header 210 // and latch blocks. As a result, we never enter this function for the 211 // header block in the non VPlan-native path. 212 if (!PredBB) { 213 assert(EnableVPlanNativePath && 214 "Unexpected null predecessor in non VPlan-native path"); 215 CFG.VPBBsToFix.push_back(PredVPBB); 216 continue; 217 } 218 219 assert(PredBB && "Predecessor basic-block not found building successor."); 220 auto *PredBBTerminator = PredBB->getTerminator(); 221 LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n'); 222 if (isa<UnreachableInst>(PredBBTerminator)) { 223 assert(PredVPSuccessors.size() == 1 && 224 "Predecessor ending w/o branch must have single successor."); 225 PredBBTerminator->eraseFromParent(); 226 BranchInst::Create(NewBB, PredBB); 227 } else { 228 assert(PredVPSuccessors.size() == 2 && 229 "Predecessor ending with branch must have two successors."); 230 unsigned idx = PredVPSuccessors.front() == this ? 0 : 1; 231 assert(!PredBBTerminator->getSuccessor(idx) && 232 "Trying to reset an existing successor block."); 233 PredBBTerminator->setSuccessor(idx, NewBB); 234 } 235 } 236 return NewBB; 237 } 238 239 void VPBasicBlock::execute(VPTransformState *State) { 240 bool Replica = State->Instance && 241 !(State->Instance->Part == 0 && State->Instance->Lane == 0); 242 VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB; 243 VPBlockBase *SingleHPred = nullptr; 244 BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible. 245 246 // 1. Create an IR basic block, or reuse the last one if possible. 247 // The last IR basic block is reused, as an optimization, in three cases: 248 // A. the first VPBB reuses the loop header BB - when PrevVPBB is null; 249 // B. when the current VPBB has a single (hierarchical) predecessor which 250 // is PrevVPBB and the latter has a single (hierarchical) successor; and 251 // C. when the current VPBB is an entry of a region replica - where PrevVPBB 252 // is the exit of this region from a previous instance, or the predecessor 253 // of this region. 254 if (PrevVPBB && /* A */ 255 !((SingleHPred = getSingleHierarchicalPredecessor()) && 256 SingleHPred->getExitBasicBlock() == PrevVPBB && 257 PrevVPBB->getSingleHierarchicalSuccessor()) && /* B */ 258 !(Replica && getPredecessors().empty())) { /* C */ 259 NewBB = createEmptyBasicBlock(State->CFG); 260 State->Builder.SetInsertPoint(NewBB); 261 // Temporarily terminate with unreachable until CFG is rewired. 262 UnreachableInst *Terminator = State->Builder.CreateUnreachable(); 263 State->Builder.SetInsertPoint(Terminator); 264 // Register NewBB in its loop. In innermost loops its the same for all BB's. 265 Loop *L = State->LI->getLoopFor(State->CFG.LastBB); 266 L->addBasicBlockToLoop(NewBB, *State->LI); 267 State->CFG.PrevBB = NewBB; 268 } 269 270 // 2. Fill the IR basic block with IR instructions. 271 LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName() 272 << " in BB:" << NewBB->getName() << '\n'); 273 274 State->CFG.VPBB2IRBB[this] = NewBB; 275 State->CFG.PrevVPBB = this; 276 277 for (VPRecipeBase &Recipe : Recipes) 278 Recipe.execute(*State); 279 280 VPValue *CBV; 281 if (EnableVPlanNativePath && (CBV = getCondBit())) { 282 Value *IRCBV = CBV->getUnderlyingValue(); 283 assert(IRCBV && "Unexpected null underlying value for condition bit"); 284 285 // Condition bit value in a VPBasicBlock is used as the branch selector. In 286 // the VPlan-native path case, since all branches are uniform we generate a 287 // branch instruction using the condition value from vector lane 0 and dummy 288 // successors. The successors are fixed later when the successor blocks are 289 // visited. 290 Value *NewCond = State->Callback.getOrCreateVectorValues(IRCBV, 0); 291 NewCond = State->Builder.CreateExtractElement(NewCond, 292 State->Builder.getInt32(0)); 293 294 // Replace the temporary unreachable terminator with the new conditional 295 // branch. 296 auto *CurrentTerminator = NewBB->getTerminator(); 297 assert(isa<UnreachableInst>(CurrentTerminator) && 298 "Expected to replace unreachable terminator with conditional " 299 "branch."); 300 auto *CondBr = BranchInst::Create(NewBB, nullptr, NewCond); 301 CondBr->setSuccessor(0, nullptr); 302 ReplaceInstWithInst(CurrentTerminator, CondBr); 303 } 304 305 LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB); 306 } 307 308 void VPRegionBlock::execute(VPTransformState *State) { 309 ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry); 310 311 if (!isReplicator()) { 312 // Visit the VPBlocks connected to "this", starting from it. 313 for (VPBlockBase *Block : RPOT) { 314 if (EnableVPlanNativePath) { 315 // The inner loop vectorization path does not represent loop preheader 316 // and exit blocks as part of the VPlan. In the VPlan-native path, skip 317 // vectorizing loop preheader block. In future, we may replace this 318 // check with the check for loop preheader. 319 if (Block->getNumPredecessors() == 0) 320 continue; 321 322 // Skip vectorizing loop exit block. In future, we may replace this 323 // check with the check for loop exit. 324 if (Block->getNumSuccessors() == 0) 325 continue; 326 } 327 328 LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); 329 Block->execute(State); 330 } 331 return; 332 } 333 334 assert(!State->Instance && "Replicating a Region with non-null instance."); 335 336 // Enter replicating mode. 337 State->Instance = {0, 0}; 338 339 for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) { 340 State->Instance->Part = Part; 341 assert(!State->VF.isScalable() && "VF is assumed to be non scalable."); 342 for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF; 343 ++Lane) { 344 State->Instance->Lane = Lane; 345 // Visit the VPBlocks connected to \p this, starting from it. 346 for (VPBlockBase *Block : RPOT) { 347 LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); 348 Block->execute(State); 349 } 350 } 351 } 352 353 // Exit replicating mode. 354 State->Instance.reset(); 355 } 356 357 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { 358 assert(!Parent && "Recipe already in some VPBasicBlock"); 359 assert(InsertPos->getParent() && 360 "Insertion position not in any VPBasicBlock"); 361 Parent = InsertPos->getParent(); 362 Parent->getRecipeList().insert(InsertPos->getIterator(), this); 363 } 364 365 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { 366 assert(!Parent && "Recipe already in some VPBasicBlock"); 367 assert(InsertPos->getParent() && 368 "Insertion position not in any VPBasicBlock"); 369 Parent = InsertPos->getParent(); 370 Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this); 371 } 372 373 void VPRecipeBase::removeFromParent() { 374 assert(getParent() && "Recipe not in any VPBasicBlock"); 375 getParent()->getRecipeList().remove(getIterator()); 376 Parent = nullptr; 377 } 378 379 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { 380 assert(getParent() && "Recipe not in any VPBasicBlock"); 381 return getParent()->getRecipeList().erase(getIterator()); 382 } 383 384 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { 385 removeFromParent(); 386 insertAfter(InsertPos); 387 } 388 389 void VPInstruction::generateInstruction(VPTransformState &State, 390 unsigned Part) { 391 IRBuilder<> &Builder = State.Builder; 392 393 if (Instruction::isBinaryOp(getOpcode())) { 394 Value *A = State.get(getOperand(0), Part); 395 Value *B = State.get(getOperand(1), Part); 396 Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B); 397 State.set(this, V, Part); 398 return; 399 } 400 401 switch (getOpcode()) { 402 case VPInstruction::Not: { 403 Value *A = State.get(getOperand(0), Part); 404 Value *V = Builder.CreateNot(A); 405 State.set(this, V, Part); 406 break; 407 } 408 case VPInstruction::ICmpULE: { 409 Value *IV = State.get(getOperand(0), Part); 410 Value *TC = State.get(getOperand(1), Part); 411 Value *V = Builder.CreateICmpULE(IV, TC); 412 State.set(this, V, Part); 413 break; 414 } 415 case Instruction::Select: { 416 Value *Cond = State.get(getOperand(0), Part); 417 Value *Op1 = State.get(getOperand(1), Part); 418 Value *Op2 = State.get(getOperand(2), Part); 419 Value *V = Builder.CreateSelect(Cond, Op1, Op2); 420 State.set(this, V, Part); 421 break; 422 } 423 case VPInstruction::ActiveLaneMask: { 424 // Get first lane of vector induction variable. 425 Value *VIVElem0 = State.get(getOperand(0), {Part, 0}); 426 // Get the original loop tripcount. 427 Value *ScalarTC = State.TripCount; 428 429 auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); 430 auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.getKnownMinValue()); 431 Instruction *Call = Builder.CreateIntrinsic( 432 Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, 433 {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); 434 State.set(this, Call, Part); 435 break; 436 } 437 default: 438 llvm_unreachable("Unsupported opcode for instruction"); 439 } 440 } 441 442 void VPInstruction::execute(VPTransformState &State) { 443 assert(!State.Instance && "VPInstruction executing an Instance"); 444 for (unsigned Part = 0; Part < State.UF; ++Part) 445 generateInstruction(State, Part); 446 } 447 448 void VPInstruction::print(raw_ostream &O, const Twine &Indent, 449 VPSlotTracker &SlotTracker) const { 450 O << "\"EMIT "; 451 print(O, SlotTracker); 452 } 453 454 void VPInstruction::print(raw_ostream &O) const { 455 VPSlotTracker SlotTracker(getParent()->getPlan()); 456 print(O, SlotTracker); 457 } 458 459 void VPInstruction::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { 460 if (hasResult()) { 461 printAsOperand(O, SlotTracker); 462 O << " = "; 463 } 464 465 switch (getOpcode()) { 466 case VPInstruction::Not: 467 O << "not"; 468 break; 469 case VPInstruction::ICmpULE: 470 O << "icmp ule"; 471 break; 472 case VPInstruction::SLPLoad: 473 O << "combined load"; 474 break; 475 case VPInstruction::SLPStore: 476 O << "combined store"; 477 break; 478 case VPInstruction::ActiveLaneMask: 479 O << "active lane mask"; 480 break; 481 482 default: 483 O << Instruction::getOpcodeName(getOpcode()); 484 } 485 486 for (const VPValue *Operand : operands()) { 487 O << " "; 488 Operand->printAsOperand(O, SlotTracker); 489 } 490 } 491 492 /// Generate the code inside the body of the vectorized loop. Assumes a single 493 /// LoopVectorBody basic-block was created for this. Introduce additional 494 /// basic-blocks as needed, and fill them all. 495 void VPlan::execute(VPTransformState *State) { 496 // -1. Check if the backedge taken count is needed, and if so build it. 497 if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) { 498 Value *TC = State->TripCount; 499 IRBuilder<> Builder(State->CFG.PrevBB->getTerminator()); 500 auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1), 501 "trip.count.minus.1"); 502 auto VF = State->VF; 503 Value *VTCMO = 504 VF == 1 ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast"); 505 for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) 506 State->set(BackedgeTakenCount, VTCMO, Part); 507 } 508 509 // 0. Set the reverse mapping from VPValues to Values for code generation. 510 for (auto &Entry : Value2VPValue) 511 State->VPValue2Value[Entry.second] = Entry.first; 512 513 BasicBlock *VectorPreHeaderBB = State->CFG.PrevBB; 514 BasicBlock *VectorHeaderBB = VectorPreHeaderBB->getSingleSuccessor(); 515 assert(VectorHeaderBB && "Loop preheader does not have a single successor."); 516 517 // 1. Make room to generate basic-blocks inside loop body if needed. 518 BasicBlock *VectorLatchBB = VectorHeaderBB->splitBasicBlock( 519 VectorHeaderBB->getFirstInsertionPt(), "vector.body.latch"); 520 Loop *L = State->LI->getLoopFor(VectorHeaderBB); 521 L->addBasicBlockToLoop(VectorLatchBB, *State->LI); 522 // Remove the edge between Header and Latch to allow other connections. 523 // Temporarily terminate with unreachable until CFG is rewired. 524 // Note: this asserts the generated code's assumption that 525 // getFirstInsertionPt() can be dereferenced into an Instruction. 526 VectorHeaderBB->getTerminator()->eraseFromParent(); 527 State->Builder.SetInsertPoint(VectorHeaderBB); 528 UnreachableInst *Terminator = State->Builder.CreateUnreachable(); 529 State->Builder.SetInsertPoint(Terminator); 530 531 // 2. Generate code in loop body. 532 State->CFG.PrevVPBB = nullptr; 533 State->CFG.PrevBB = VectorHeaderBB; 534 State->CFG.LastBB = VectorLatchBB; 535 536 for (VPBlockBase *Block : depth_first(Entry)) 537 Block->execute(State); 538 539 // Setup branch terminator successors for VPBBs in VPBBsToFix based on 540 // VPBB's successors. 541 for (auto VPBB : State->CFG.VPBBsToFix) { 542 assert(EnableVPlanNativePath && 543 "Unexpected VPBBsToFix in non VPlan-native path"); 544 BasicBlock *BB = State->CFG.VPBB2IRBB[VPBB]; 545 assert(BB && "Unexpected null basic block for VPBB"); 546 547 unsigned Idx = 0; 548 auto *BBTerminator = BB->getTerminator(); 549 550 for (VPBlockBase *SuccVPBlock : VPBB->getHierarchicalSuccessors()) { 551 VPBasicBlock *SuccVPBB = SuccVPBlock->getEntryBasicBlock(); 552 BBTerminator->setSuccessor(Idx, State->CFG.VPBB2IRBB[SuccVPBB]); 553 ++Idx; 554 } 555 } 556 557 // 3. Merge the temporary latch created with the last basic-block filled. 558 BasicBlock *LastBB = State->CFG.PrevBB; 559 // Connect LastBB to VectorLatchBB to facilitate their merge. 560 assert((EnableVPlanNativePath || 561 isa<UnreachableInst>(LastBB->getTerminator())) && 562 "Expected InnerLoop VPlan CFG to terminate with unreachable"); 563 assert((!EnableVPlanNativePath || isa<BranchInst>(LastBB->getTerminator())) && 564 "Expected VPlan CFG to terminate with branch in NativePath"); 565 LastBB->getTerminator()->eraseFromParent(); 566 BranchInst::Create(VectorLatchBB, LastBB); 567 568 // Merge LastBB with Latch. 569 bool Merged = MergeBlockIntoPredecessor(VectorLatchBB, nullptr, State->LI); 570 (void)Merged; 571 assert(Merged && "Could not merge last basic block with latch."); 572 VectorLatchBB = LastBB; 573 574 // We do not attempt to preserve DT for outer loop vectorization currently. 575 if (!EnableVPlanNativePath) 576 updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB, 577 L->getExitBlock()); 578 } 579 580 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 581 LLVM_DUMP_METHOD 582 void VPlan::dump() const { dbgs() << *this << '\n'; } 583 #endif 584 585 void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, 586 BasicBlock *LoopLatchBB, 587 BasicBlock *LoopExitBB) { 588 BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor(); 589 assert(LoopHeaderBB && "Loop preheader does not have a single successor."); 590 // The vector body may be more than a single basic-block by this point. 591 // Update the dominator tree information inside the vector body by propagating 592 // it from header to latch, expecting only triangular control-flow, if any. 593 BasicBlock *PostDomSucc = nullptr; 594 for (auto *BB = LoopHeaderBB; BB != LoopLatchBB; BB = PostDomSucc) { 595 // Get the list of successors of this block. 596 std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB)); 597 assert(Succs.size() <= 2 && 598 "Basic block in vector loop has more than 2 successors."); 599 PostDomSucc = Succs[0]; 600 if (Succs.size() == 1) { 601 assert(PostDomSucc->getSinglePredecessor() && 602 "PostDom successor has more than one predecessor."); 603 DT->addNewBlock(PostDomSucc, BB); 604 continue; 605 } 606 BasicBlock *InterimSucc = Succs[1]; 607 if (PostDomSucc->getSingleSuccessor() == InterimSucc) { 608 PostDomSucc = Succs[1]; 609 InterimSucc = Succs[0]; 610 } 611 assert(InterimSucc->getSingleSuccessor() == PostDomSucc && 612 "One successor of a basic block does not lead to the other."); 613 assert(InterimSucc->getSinglePredecessor() && 614 "Interim successor has more than one predecessor."); 615 assert(PostDomSucc->hasNPredecessors(2) && 616 "PostDom successor has more than two predecessors."); 617 DT->addNewBlock(InterimSucc, BB); 618 DT->addNewBlock(PostDomSucc, BB); 619 } 620 // Latch block is a new dominator for the loop exit. 621 DT->changeImmediateDominator(LoopExitBB, LoopLatchBB); 622 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 623 } 624 625 const Twine VPlanPrinter::getUID(const VPBlockBase *Block) { 626 return (isa<VPRegionBlock>(Block) ? "cluster_N" : "N") + 627 Twine(getOrCreateBID(Block)); 628 } 629 630 const Twine VPlanPrinter::getOrCreateName(const VPBlockBase *Block) { 631 const std::string &Name = Block->getName(); 632 if (!Name.empty()) 633 return Name; 634 return "VPB" + Twine(getOrCreateBID(Block)); 635 } 636 637 void VPlanPrinter::dump() { 638 Depth = 1; 639 bumpIndent(0); 640 OS << "digraph VPlan {\n"; 641 OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan"; 642 if (!Plan.getName().empty()) 643 OS << "\\n" << DOT::EscapeString(Plan.getName()); 644 if (Plan.BackedgeTakenCount) { 645 OS << ", where:\\n"; 646 Plan.BackedgeTakenCount->print(OS, SlotTracker); 647 OS << " := BackedgeTakenCount"; 648 } 649 OS << "\"]\n"; 650 OS << "node [shape=rect, fontname=Courier, fontsize=30]\n"; 651 OS << "edge [fontname=Courier, fontsize=30]\n"; 652 OS << "compound=true\n"; 653 654 for (const VPBlockBase *Block : depth_first(Plan.getEntry())) 655 dumpBlock(Block); 656 657 OS << "}\n"; 658 } 659 660 void VPlanPrinter::dumpBlock(const VPBlockBase *Block) { 661 if (const VPBasicBlock *BasicBlock = dyn_cast<VPBasicBlock>(Block)) 662 dumpBasicBlock(BasicBlock); 663 else if (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) 664 dumpRegion(Region); 665 else 666 llvm_unreachable("Unsupported kind of VPBlock."); 667 } 668 669 void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To, 670 bool Hidden, const Twine &Label) { 671 // Due to "dot" we print an edge between two regions as an edge between the 672 // exit basic block and the entry basic of the respective regions. 673 const VPBlockBase *Tail = From->getExitBasicBlock(); 674 const VPBlockBase *Head = To->getEntryBasicBlock(); 675 OS << Indent << getUID(Tail) << " -> " << getUID(Head); 676 OS << " [ label=\"" << Label << '\"'; 677 if (Tail != From) 678 OS << " ltail=" << getUID(From); 679 if (Head != To) 680 OS << " lhead=" << getUID(To); 681 if (Hidden) 682 OS << "; splines=none"; 683 OS << "]\n"; 684 } 685 686 void VPlanPrinter::dumpEdges(const VPBlockBase *Block) { 687 auto &Successors = Block->getSuccessors(); 688 if (Successors.size() == 1) 689 drawEdge(Block, Successors.front(), false, ""); 690 else if (Successors.size() == 2) { 691 drawEdge(Block, Successors.front(), false, "T"); 692 drawEdge(Block, Successors.back(), false, "F"); 693 } else { 694 unsigned SuccessorNumber = 0; 695 for (auto *Successor : Successors) 696 drawEdge(Block, Successor, false, Twine(SuccessorNumber++)); 697 } 698 } 699 700 void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) { 701 OS << Indent << getUID(BasicBlock) << " [label =\n"; 702 bumpIndent(1); 703 OS << Indent << "\"" << DOT::EscapeString(BasicBlock->getName()) << ":\\n\""; 704 bumpIndent(1); 705 706 // Dump the block predicate. 707 const VPValue *Pred = BasicBlock->getPredicate(); 708 if (Pred) { 709 OS << " +\n" << Indent << " \"BlockPredicate: "; 710 if (const VPInstruction *PredI = dyn_cast<VPInstruction>(Pred)) { 711 PredI->printAsOperand(OS, SlotTracker); 712 OS << " (" << DOT::EscapeString(PredI->getParent()->getName()) 713 << ")\\l\""; 714 } else 715 Pred->printAsOperand(OS, SlotTracker); 716 } 717 718 for (const VPRecipeBase &Recipe : *BasicBlock) { 719 OS << " +\n" << Indent; 720 Recipe.print(OS, Indent, SlotTracker); 721 OS << "\\l\""; 722 } 723 724 // Dump the condition bit. 725 const VPValue *CBV = BasicBlock->getCondBit(); 726 if (CBV) { 727 OS << " +\n" << Indent << " \"CondBit: "; 728 if (const VPInstruction *CBI = dyn_cast<VPInstruction>(CBV)) { 729 CBI->printAsOperand(OS, SlotTracker); 730 OS << " (" << DOT::EscapeString(CBI->getParent()->getName()) << ")\\l\""; 731 } else { 732 CBV->printAsOperand(OS, SlotTracker); 733 OS << "\""; 734 } 735 } 736 737 bumpIndent(-2); 738 OS << "\n" << Indent << "]\n"; 739 dumpEdges(BasicBlock); 740 } 741 742 void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) { 743 OS << Indent << "subgraph " << getUID(Region) << " {\n"; 744 bumpIndent(1); 745 OS << Indent << "fontname=Courier\n" 746 << Indent << "label=\"" 747 << DOT::EscapeString(Region->isReplicator() ? "<xVFxUF> " : "<x1> ") 748 << DOT::EscapeString(Region->getName()) << "\"\n"; 749 // Dump the blocks of the region. 750 assert(Region->getEntry() && "Region contains no inner blocks."); 751 for (const VPBlockBase *Block : depth_first(Region->getEntry())) 752 dumpBlock(Block); 753 bumpIndent(-1); 754 OS << Indent << "}\n"; 755 dumpEdges(Region); 756 } 757 758 void VPlanPrinter::printAsIngredient(raw_ostream &O, Value *V) { 759 std::string IngredientString; 760 raw_string_ostream RSO(IngredientString); 761 if (auto *Inst = dyn_cast<Instruction>(V)) { 762 if (!Inst->getType()->isVoidTy()) { 763 Inst->printAsOperand(RSO, false); 764 RSO << " = "; 765 } 766 RSO << Inst->getOpcodeName() << " "; 767 unsigned E = Inst->getNumOperands(); 768 if (E > 0) { 769 Inst->getOperand(0)->printAsOperand(RSO, false); 770 for (unsigned I = 1; I < E; ++I) 771 Inst->getOperand(I)->printAsOperand(RSO << ", ", false); 772 } 773 } else // !Inst 774 V->printAsOperand(RSO, false); 775 RSO.flush(); 776 O << DOT::EscapeString(IngredientString); 777 } 778 779 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, 780 VPSlotTracker &SlotTracker) const { 781 O << "\"WIDEN-CALL " << VPlanIngredient(&Ingredient); 782 } 783 784 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, 785 VPSlotTracker &SlotTracker) const { 786 O << "\"WIDEN-SELECT" << VPlanIngredient(&Ingredient) 787 << (InvariantCond ? " (condition is loop invariant)" : ""); 788 } 789 790 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, 791 VPSlotTracker &SlotTracker) const { 792 O << "\"WIDEN\\l\""; 793 O << "\" " << VPlanIngredient(&Ingredient); 794 } 795 796 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, 797 VPSlotTracker &SlotTracker) const { 798 O << "\"WIDEN-INDUCTION"; 799 if (Trunc) { 800 O << "\\l\""; 801 O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; 802 O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc); 803 } else 804 O << " " << VPlanIngredient(IV); 805 } 806 807 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, 808 VPSlotTracker &SlotTracker) const { 809 O << "\"WIDEN-GEP "; 810 O << (IsPtrLoopInvariant ? "Inv" : "Var"); 811 size_t IndicesNumber = IsIndexLoopInvariant.size(); 812 for (size_t I = 0; I < IndicesNumber; ++I) 813 O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]"; 814 O << "\\l\""; 815 O << " +\n" << Indent << "\" " << VPlanIngredient(GEP); 816 } 817 818 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, 819 VPSlotTracker &SlotTracker) const { 820 O << "\"WIDEN-PHI " << VPlanIngredient(Phi); 821 } 822 823 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, 824 VPSlotTracker &SlotTracker) const { 825 O << "\"BLEND "; 826 Phi->printAsOperand(O, false); 827 O << " ="; 828 if (getNumIncomingValues() == 1) { 829 // Not a User of any mask: not really blending, this is a 830 // single-predecessor phi. 831 O << " "; 832 getIncomingValue(0)->printAsOperand(O, SlotTracker); 833 } else { 834 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { 835 O << " "; 836 getIncomingValue(I)->printAsOperand(O, SlotTracker); 837 O << "/"; 838 getMask(I)->printAsOperand(O, SlotTracker); 839 } 840 } 841 } 842 843 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, 844 VPSlotTracker &SlotTracker) const { 845 O << "\"REDUCE of" << *I << " as "; 846 ChainOp->printAsOperand(O, SlotTracker); 847 O << " + reduce("; 848 VecOp->printAsOperand(O, SlotTracker); 849 O << ")"; 850 } 851 852 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, 853 VPSlotTracker &SlotTracker) const { 854 O << "\"" << (IsUniform ? "CLONE " : "REPLICATE ") 855 << VPlanIngredient(Ingredient); 856 if (AlsoPack) 857 O << " (S->V)"; 858 } 859 860 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, 861 VPSlotTracker &SlotTracker) const { 862 O << "\"PHI-PREDICATED-INSTRUCTION " << VPlanIngredient(PredInst); 863 } 864 865 void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, 866 VPSlotTracker &SlotTracker) const { 867 O << "\"WIDEN " << VPlanIngredient(&Instr); 868 O << ", "; 869 getAddr()->printAsOperand(O, SlotTracker); 870 VPValue *Mask = getMask(); 871 if (Mask) { 872 O << ", "; 873 Mask->printAsOperand(O, SlotTracker); 874 } 875 } 876 877 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { 878 Value *CanonicalIV = State.CanonicalIV; 879 Type *STy = CanonicalIV->getType(); 880 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); 881 ElementCount VF = State.VF; 882 assert(!VF.isScalable() && "the code following assumes non scalables ECs"); 883 Value *VStart = VF.isScalar() 884 ? CanonicalIV 885 : Builder.CreateVectorSplat(VF.getKnownMinValue(), 886 CanonicalIV, "broadcast"); 887 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { 888 SmallVector<Constant *, 8> Indices; 889 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 890 Indices.push_back( 891 ConstantInt::get(STy, Part * VF.getKnownMinValue() + Lane)); 892 // If VF == 1, there is only one iteration in the loop above, thus the 893 // element pushed back into Indices is ConstantInt::get(STy, Part) 894 Constant *VStep = VF == 1 ? Indices.back() : ConstantVector::get(Indices); 895 // Add the consecutive indices to the vector value. 896 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); 897 State.set(getVPValue(), CanonicalVectorIV, Part); 898 } 899 } 900 901 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, 902 VPSlotTracker &SlotTracker) const { 903 O << "\"EMIT "; 904 getVPValue()->printAsOperand(O, SlotTracker); 905 O << " = WIDEN-CANONICAL-INDUCTION"; 906 } 907 908 template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT); 909 910 void VPValue::replaceAllUsesWith(VPValue *New) { 911 for (unsigned J = 0; J < getNumUsers();) { 912 VPUser *User = Users[J]; 913 unsigned NumUsers = getNumUsers(); 914 for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) 915 if (User->getOperand(I) == this) 916 User->setOperand(I, New); 917 // If a user got removed after updating the current user, the next user to 918 // update will be moved to the current position, so we only need to 919 // increment the index if the number of users did not change. 920 if (NumUsers == getNumUsers()) 921 J++; 922 } 923 } 924 925 void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const { 926 if (const Value *UV = getUnderlyingValue()) { 927 OS << "ir<"; 928 UV->printAsOperand(OS, false); 929 OS << ">"; 930 return; 931 } 932 933 unsigned Slot = Tracker.getSlot(this); 934 if (Slot == unsigned(-1)) 935 OS << "<badref>"; 936 else 937 OS << "vp<%" << Tracker.getSlot(this) << ">"; 938 } 939 940 void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region, 941 Old2NewTy &Old2New, 942 InterleavedAccessInfo &IAI) { 943 ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry()); 944 for (VPBlockBase *Base : RPOT) { 945 visitBlock(Base, Old2New, IAI); 946 } 947 } 948 949 void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, 950 InterleavedAccessInfo &IAI) { 951 if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) { 952 for (VPRecipeBase &VPI : *VPBB) { 953 assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions"); 954 auto *VPInst = cast<VPInstruction>(&VPI); 955 auto *Inst = cast<Instruction>(VPInst->getUnderlyingValue()); 956 auto *IG = IAI.getInterleaveGroup(Inst); 957 if (!IG) 958 continue; 959 960 auto NewIGIter = Old2New.find(IG); 961 if (NewIGIter == Old2New.end()) 962 Old2New[IG] = new InterleaveGroup<VPInstruction>( 963 IG->getFactor(), IG->isReverse(), IG->getAlign()); 964 965 if (Inst == IG->getInsertPos()) 966 Old2New[IG]->setInsertPos(VPInst); 967 968 InterleaveGroupMap[VPInst] = Old2New[IG]; 969 InterleaveGroupMap[VPInst]->insertMember( 970 VPInst, IG->getIndex(Inst), 971 Align(IG->isReverse() ? (-1) * int(IG->getFactor()) 972 : IG->getFactor())); 973 } 974 } else if (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) 975 visitRegion(Region, Old2New, IAI); 976 else 977 llvm_unreachable("Unsupported kind of VPBlock."); 978 } 979 980 VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan, 981 InterleavedAccessInfo &IAI) { 982 Old2NewTy Old2New; 983 visitRegion(cast<VPRegionBlock>(Plan.getEntry()), Old2New, IAI); 984 } 985 986 void VPSlotTracker::assignSlot(const VPValue *V) { 987 assert(Slots.find(V) == Slots.end() && "VPValue already has a slot!"); 988 const Value *UV = V->getUnderlyingValue(); 989 if (UV) 990 return; 991 const auto *VPI = dyn_cast<VPInstruction>(V); 992 if (VPI && !VPI->hasResult()) 993 return; 994 995 Slots[V] = NextSlot++; 996 } 997 998 void VPSlotTracker::assignSlots(const VPBlockBase *VPBB) { 999 if (auto *Region = dyn_cast<VPRegionBlock>(VPBB)) 1000 assignSlots(Region); 1001 else 1002 assignSlots(cast<VPBasicBlock>(VPBB)); 1003 } 1004 1005 void VPSlotTracker::assignSlots(const VPRegionBlock *Region) { 1006 ReversePostOrderTraversal<const VPBlockBase *> RPOT(Region->getEntry()); 1007 for (const VPBlockBase *Block : RPOT) 1008 assignSlots(Block); 1009 } 1010 1011 void VPSlotTracker::assignSlots(const VPBasicBlock *VPBB) { 1012 for (const VPRecipeBase &Recipe : *VPBB) { 1013 if (const auto *VPI = dyn_cast<VPInstruction>(&Recipe)) 1014 assignSlot(VPI); 1015 else if (const auto *VPIV = dyn_cast<VPWidenCanonicalIVRecipe>(&Recipe)) 1016 assignSlot(VPIV->getVPValue()); 1017 } 1018 } 1019 1020 void VPSlotTracker::assignSlots(const VPlan &Plan) { 1021 1022 for (const VPValue *V : Plan.VPExternalDefs) 1023 assignSlot(V); 1024 1025 for (auto &E : Plan.Value2VPValue) 1026 if (!isa<VPInstruction>(E.second)) 1027 assignSlot(E.second); 1028 1029 for (const VPValue *V : Plan.VPCBVs) 1030 assignSlot(V); 1031 1032 if (Plan.BackedgeTakenCount) 1033 assignSlot(Plan.BackedgeTakenCount); 1034 1035 ReversePostOrderTraversal<const VPBlockBase *> RPOT(Plan.getEntry()); 1036 for (const VPBlockBase *Block : RPOT) 1037 assignSlots(Block); 1038 } 1039