1 //===- VPlan.cpp - Vectorizer Plan ----------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This is the LLVM vectorization plan. It represents a candidate for 11 /// vectorization, allowing to plan and optimize how to vectorize a given loop 12 /// before generating LLVM-IR. 13 /// The vectorizer uses vectorization plans to estimate the costs of potential 14 /// candidates and if profitable to execute the desired plan, generating vector 15 /// LLVM-IR code. 16 /// 17 //===----------------------------------------------------------------------===// 18 19 #include "VPlan.h" 20 #include "VPlanDominatorTree.h" 21 #include "llvm/ADT/DepthFirstIterator.h" 22 #include "llvm/ADT/PostOrderIterator.h" 23 #include "llvm/ADT/SmallVector.h" 24 #include "llvm/ADT/Twine.h" 25 #include "llvm/Analysis/LoopInfo.h" 26 #include "llvm/IR/BasicBlock.h" 27 #include "llvm/IR/CFG.h" 28 #include "llvm/IR/InstrTypes.h" 29 #include "llvm/IR/Instruction.h" 30 #include "llvm/IR/Instructions.h" 31 #include "llvm/IR/Type.h" 32 #include "llvm/IR/Value.h" 33 #include "llvm/Support/Casting.h" 34 #include "llvm/Support/CommandLine.h" 35 #include "llvm/Support/Debug.h" 36 #include "llvm/Support/ErrorHandling.h" 37 #include "llvm/Support/GenericDomTreeConstruction.h" 38 #include "llvm/Support/GraphWriter.h" 39 #include "llvm/Support/raw_ostream.h" 40 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 41 #include <cassert> 42 #include <iterator> 43 #include <string> 44 #include <vector> 45 46 using namespace llvm; 47 extern cl::opt<bool> EnableVPlanNativePath; 48 49 #define DEBUG_TYPE "vplan" 50 51 raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) { 52 const VPInstruction *Instr = dyn_cast<VPInstruction>(&V); 53 VPSlotTracker SlotTracker( 54 (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr); 55 V.print(OS, SlotTracker); 56 return OS; 57 } 58 59 void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const { 60 if (const VPInstruction *Instr = dyn_cast<VPInstruction>(this)) 61 Instr->print(OS, SlotTracker); 62 else 63 printAsOperand(OS, SlotTracker); 64 } 65 66 void VPValue::dump() const { 67 const VPInstruction *Instr = dyn_cast<VPInstruction>(this); 68 VPSlotTracker SlotTracker( 69 (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr); 70 print(dbgs(), SlotTracker); 71 dbgs() << "\n"; 72 } 73 74 void VPRecipeBase::dump() const { 75 VPSlotTracker SlotTracker(nullptr); 76 print(dbgs(), "", SlotTracker); 77 dbgs() << "\n"; 78 } 79 80 VPUser *VPRecipeBase::toVPUser() { 81 if (auto *U = dyn_cast<VPInstruction>(this)) 82 return U; 83 if (auto *U = dyn_cast<VPWidenRecipe>(this)) 84 return U; 85 if (auto *U = dyn_cast<VPWidenCallRecipe>(this)) 86 return U; 87 if (auto *U = dyn_cast<VPWidenSelectRecipe>(this)) 88 return U; 89 if (auto *U = dyn_cast<VPWidenGEPRecipe>(this)) 90 return U; 91 if (auto *U = dyn_cast<VPBlendRecipe>(this)) 92 return U; 93 if (auto *U = dyn_cast<VPInterleaveRecipe>(this)) 94 return U; 95 if (auto *U = dyn_cast<VPReplicateRecipe>(this)) 96 return U; 97 if (auto *U = dyn_cast<VPBranchOnMaskRecipe>(this)) 98 return U; 99 if (auto *U = dyn_cast<VPWidenMemoryInstructionRecipe>(this)) 100 return U; 101 return nullptr; 102 } 103 104 VPValue *VPRecipeBase::toVPValue() { 105 if (auto *V = dyn_cast<VPInstruction>(this)) 106 return V; 107 if (auto *V = dyn_cast<VPWidenMemoryInstructionRecipe>(this)) 108 return V; 109 return nullptr; 110 } 111 112 const VPValue *VPRecipeBase::toVPValue() const { 113 if (auto *V = dyn_cast<VPInstruction>(this)) 114 return V; 115 if (auto *V = dyn_cast<VPWidenMemoryInstructionRecipe>(this)) 116 return V; 117 return nullptr; 118 } 119 120 // Get the top-most entry block of \p Start. This is the entry block of the 121 // containing VPlan. This function is templated to support both const and non-const blocks 122 template <typename T> static T *getPlanEntry(T *Start) { 123 T *Next = Start; 124 T *Current = Start; 125 while ((Next = Next->getParent())) 126 Current = Next; 127 128 SmallSetVector<T *, 8> WorkList; 129 WorkList.insert(Current); 130 131 for (unsigned i = 0; i < WorkList.size(); i++) { 132 T *Current = WorkList[i]; 133 if (Current->getNumPredecessors() == 0) 134 return Current; 135 auto &Predecessors = Current->getPredecessors(); 136 WorkList.insert(Predecessors.begin(), Predecessors.end()); 137 } 138 139 llvm_unreachable("VPlan without any entry node without predecessors"); 140 } 141 142 VPlan *VPBlockBase::getPlan() { return getPlanEntry(this)->Plan; } 143 144 const VPlan *VPBlockBase::getPlan() const { return getPlanEntry(this)->Plan; } 145 146 /// \return the VPBasicBlock that is the entry of Block, possibly indirectly. 147 const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const { 148 const VPBlockBase *Block = this; 149 while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) 150 Block = Region->getEntry(); 151 return cast<VPBasicBlock>(Block); 152 } 153 154 VPBasicBlock *VPBlockBase::getEntryBasicBlock() { 155 VPBlockBase *Block = this; 156 while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) 157 Block = Region->getEntry(); 158 return cast<VPBasicBlock>(Block); 159 } 160 161 void VPBlockBase::setPlan(VPlan *ParentPlan) { 162 assert(ParentPlan->getEntry() == this && 163 "Can only set plan on its entry block."); 164 Plan = ParentPlan; 165 } 166 167 /// \return the VPBasicBlock that is the exit of Block, possibly indirectly. 168 const VPBasicBlock *VPBlockBase::getExitBasicBlock() const { 169 const VPBlockBase *Block = this; 170 while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) 171 Block = Region->getExit(); 172 return cast<VPBasicBlock>(Block); 173 } 174 175 VPBasicBlock *VPBlockBase::getExitBasicBlock() { 176 VPBlockBase *Block = this; 177 while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) 178 Block = Region->getExit(); 179 return cast<VPBasicBlock>(Block); 180 } 181 182 VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() { 183 if (!Successors.empty() || !Parent) 184 return this; 185 assert(Parent->getExit() == this && 186 "Block w/o successors not the exit of its parent."); 187 return Parent->getEnclosingBlockWithSuccessors(); 188 } 189 190 VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() { 191 if (!Predecessors.empty() || !Parent) 192 return this; 193 assert(Parent->getEntry() == this && 194 "Block w/o predecessors not the entry of its parent."); 195 return Parent->getEnclosingBlockWithPredecessors(); 196 } 197 198 void VPBlockBase::deleteCFG(VPBlockBase *Entry) { 199 SmallVector<VPBlockBase *, 8> Blocks; 200 201 VPValue DummyValue; 202 for (VPBlockBase *Block : depth_first(Entry)) { 203 // Drop all references in VPBasicBlocks and replace all uses with 204 // DummyValue. 205 if (auto *VPBB = dyn_cast<VPBasicBlock>(Block)) 206 VPBB->dropAllReferences(&DummyValue); 207 Blocks.push_back(Block); 208 } 209 210 for (VPBlockBase *Block : Blocks) 211 delete Block; 212 } 213 214 VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { 215 iterator It = begin(); 216 while (It != end() && 217 (It->getVPRecipeID() == VPRecipeBase::VPWidenPHISC || 218 It->getVPRecipeID() == VPRecipeBase::VPWidenIntOrFpInductionSC || 219 It->getVPRecipeID() == VPRecipeBase::VPPredInstPHISC || 220 It->getVPRecipeID() == VPRecipeBase::VPWidenCanonicalIVSC)) 221 It++; 222 return It; 223 } 224 225 BasicBlock * 226 VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { 227 // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks. 228 // Pred stands for Predessor. Prev stands for Previous - last visited/created. 229 BasicBlock *PrevBB = CFG.PrevBB; 230 BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(), 231 PrevBB->getParent(), CFG.LastBB); 232 LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n'); 233 234 // Hook up the new basic block to its predecessors. 235 for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) { 236 VPBasicBlock *PredVPBB = PredVPBlock->getExitBasicBlock(); 237 auto &PredVPSuccessors = PredVPBB->getSuccessors(); 238 BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB]; 239 240 // In outer loop vectorization scenario, the predecessor BBlock may not yet 241 // be visited(backedge). Mark the VPBasicBlock for fixup at the end of 242 // vectorization. We do not encounter this case in inner loop vectorization 243 // as we start out by building a loop skeleton with the vector loop header 244 // and latch blocks. As a result, we never enter this function for the 245 // header block in the non VPlan-native path. 246 if (!PredBB) { 247 assert(EnableVPlanNativePath && 248 "Unexpected null predecessor in non VPlan-native path"); 249 CFG.VPBBsToFix.push_back(PredVPBB); 250 continue; 251 } 252 253 assert(PredBB && "Predecessor basic-block not found building successor."); 254 auto *PredBBTerminator = PredBB->getTerminator(); 255 LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n'); 256 if (isa<UnreachableInst>(PredBBTerminator)) { 257 assert(PredVPSuccessors.size() == 1 && 258 "Predecessor ending w/o branch must have single successor."); 259 PredBBTerminator->eraseFromParent(); 260 BranchInst::Create(NewBB, PredBB); 261 } else { 262 assert(PredVPSuccessors.size() == 2 && 263 "Predecessor ending with branch must have two successors."); 264 unsigned idx = PredVPSuccessors.front() == this ? 0 : 1; 265 assert(!PredBBTerminator->getSuccessor(idx) && 266 "Trying to reset an existing successor block."); 267 PredBBTerminator->setSuccessor(idx, NewBB); 268 } 269 } 270 return NewBB; 271 } 272 273 void VPBasicBlock::execute(VPTransformState *State) { 274 bool Replica = State->Instance && 275 !(State->Instance->Part == 0 && State->Instance->Lane == 0); 276 VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB; 277 VPBlockBase *SingleHPred = nullptr; 278 BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible. 279 280 // 1. Create an IR basic block, or reuse the last one if possible. 281 // The last IR basic block is reused, as an optimization, in three cases: 282 // A. the first VPBB reuses the loop header BB - when PrevVPBB is null; 283 // B. when the current VPBB has a single (hierarchical) predecessor which 284 // is PrevVPBB and the latter has a single (hierarchical) successor; and 285 // C. when the current VPBB is an entry of a region replica - where PrevVPBB 286 // is the exit of this region from a previous instance, or the predecessor 287 // of this region. 288 if (PrevVPBB && /* A */ 289 !((SingleHPred = getSingleHierarchicalPredecessor()) && 290 SingleHPred->getExitBasicBlock() == PrevVPBB && 291 PrevVPBB->getSingleHierarchicalSuccessor()) && /* B */ 292 !(Replica && getPredecessors().empty())) { /* C */ 293 NewBB = createEmptyBasicBlock(State->CFG); 294 State->Builder.SetInsertPoint(NewBB); 295 // Temporarily terminate with unreachable until CFG is rewired. 296 UnreachableInst *Terminator = State->Builder.CreateUnreachable(); 297 State->Builder.SetInsertPoint(Terminator); 298 // Register NewBB in its loop. In innermost loops its the same for all BB's. 299 Loop *L = State->LI->getLoopFor(State->CFG.LastBB); 300 L->addBasicBlockToLoop(NewBB, *State->LI); 301 State->CFG.PrevBB = NewBB; 302 } 303 304 // 2. Fill the IR basic block with IR instructions. 305 LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName() 306 << " in BB:" << NewBB->getName() << '\n'); 307 308 State->CFG.VPBB2IRBB[this] = NewBB; 309 State->CFG.PrevVPBB = this; 310 311 for (VPRecipeBase &Recipe : Recipes) 312 Recipe.execute(*State); 313 314 VPValue *CBV; 315 if (EnableVPlanNativePath && (CBV = getCondBit())) { 316 Value *IRCBV = CBV->getUnderlyingValue(); 317 assert(IRCBV && "Unexpected null underlying value for condition bit"); 318 319 // Condition bit value in a VPBasicBlock is used as the branch selector. In 320 // the VPlan-native path case, since all branches are uniform we generate a 321 // branch instruction using the condition value from vector lane 0 and dummy 322 // successors. The successors are fixed later when the successor blocks are 323 // visited. 324 Value *NewCond = State->Callback.getOrCreateVectorValues(IRCBV, 0); 325 NewCond = State->Builder.CreateExtractElement(NewCond, 326 State->Builder.getInt32(0)); 327 328 // Replace the temporary unreachable terminator with the new conditional 329 // branch. 330 auto *CurrentTerminator = NewBB->getTerminator(); 331 assert(isa<UnreachableInst>(CurrentTerminator) && 332 "Expected to replace unreachable terminator with conditional " 333 "branch."); 334 auto *CondBr = BranchInst::Create(NewBB, nullptr, NewCond); 335 CondBr->setSuccessor(0, nullptr); 336 ReplaceInstWithInst(CurrentTerminator, CondBr); 337 } 338 339 LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB); 340 } 341 342 void VPBasicBlock::dropAllReferences(VPValue *NewValue) { 343 for (VPRecipeBase &R : Recipes) { 344 if (auto *VPV = R.toVPValue()) 345 VPV->replaceAllUsesWith(NewValue); 346 347 if (auto *User = R.toVPUser()) 348 for (unsigned I = 0, E = User->getNumOperands(); I != E; I++) 349 User->setOperand(I, NewValue); 350 } 351 } 352 353 void VPRegionBlock::execute(VPTransformState *State) { 354 ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry); 355 356 if (!isReplicator()) { 357 // Visit the VPBlocks connected to "this", starting from it. 358 for (VPBlockBase *Block : RPOT) { 359 if (EnableVPlanNativePath) { 360 // The inner loop vectorization path does not represent loop preheader 361 // and exit blocks as part of the VPlan. In the VPlan-native path, skip 362 // vectorizing loop preheader block. In future, we may replace this 363 // check with the check for loop preheader. 364 if (Block->getNumPredecessors() == 0) 365 continue; 366 367 // Skip vectorizing loop exit block. In future, we may replace this 368 // check with the check for loop exit. 369 if (Block->getNumSuccessors() == 0) 370 continue; 371 } 372 373 LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); 374 Block->execute(State); 375 } 376 return; 377 } 378 379 assert(!State->Instance && "Replicating a Region with non-null instance."); 380 381 // Enter replicating mode. 382 State->Instance = {0, 0}; 383 384 for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) { 385 State->Instance->Part = Part; 386 assert(!State->VF.isScalable() && "VF is assumed to be non scalable."); 387 for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF; 388 ++Lane) { 389 State->Instance->Lane = Lane; 390 // Visit the VPBlocks connected to \p this, starting from it. 391 for (VPBlockBase *Block : RPOT) { 392 LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); 393 Block->execute(State); 394 } 395 } 396 } 397 398 // Exit replicating mode. 399 State->Instance.reset(); 400 } 401 402 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { 403 assert(!Parent && "Recipe already in some VPBasicBlock"); 404 assert(InsertPos->getParent() && 405 "Insertion position not in any VPBasicBlock"); 406 Parent = InsertPos->getParent(); 407 Parent->getRecipeList().insert(InsertPos->getIterator(), this); 408 } 409 410 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { 411 assert(!Parent && "Recipe already in some VPBasicBlock"); 412 assert(InsertPos->getParent() && 413 "Insertion position not in any VPBasicBlock"); 414 Parent = InsertPos->getParent(); 415 Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this); 416 } 417 418 void VPRecipeBase::removeFromParent() { 419 assert(getParent() && "Recipe not in any VPBasicBlock"); 420 getParent()->getRecipeList().remove(getIterator()); 421 Parent = nullptr; 422 } 423 424 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { 425 assert(getParent() && "Recipe not in any VPBasicBlock"); 426 return getParent()->getRecipeList().erase(getIterator()); 427 } 428 429 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { 430 removeFromParent(); 431 insertAfter(InsertPos); 432 } 433 434 void VPInstruction::generateInstruction(VPTransformState &State, 435 unsigned Part) { 436 IRBuilder<> &Builder = State.Builder; 437 438 if (Instruction::isBinaryOp(getOpcode())) { 439 Value *A = State.get(getOperand(0), Part); 440 Value *B = State.get(getOperand(1), Part); 441 Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B); 442 State.set(this, V, Part); 443 return; 444 } 445 446 switch (getOpcode()) { 447 case VPInstruction::Not: { 448 Value *A = State.get(getOperand(0), Part); 449 Value *V = Builder.CreateNot(A); 450 State.set(this, V, Part); 451 break; 452 } 453 case VPInstruction::ICmpULE: { 454 Value *IV = State.get(getOperand(0), Part); 455 Value *TC = State.get(getOperand(1), Part); 456 Value *V = Builder.CreateICmpULE(IV, TC); 457 State.set(this, V, Part); 458 break; 459 } 460 case Instruction::Select: { 461 Value *Cond = State.get(getOperand(0), Part); 462 Value *Op1 = State.get(getOperand(1), Part); 463 Value *Op2 = State.get(getOperand(2), Part); 464 Value *V = Builder.CreateSelect(Cond, Op1, Op2); 465 State.set(this, V, Part); 466 break; 467 } 468 case VPInstruction::ActiveLaneMask: { 469 // Get first lane of vector induction variable. 470 Value *VIVElem0 = State.get(getOperand(0), {Part, 0}); 471 // Get the original loop tripcount. 472 Value *ScalarTC = State.TripCount; 473 474 auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); 475 auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.getKnownMinValue()); 476 Instruction *Call = Builder.CreateIntrinsic( 477 Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, 478 {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); 479 State.set(this, Call, Part); 480 break; 481 } 482 default: 483 llvm_unreachable("Unsupported opcode for instruction"); 484 } 485 } 486 487 void VPInstruction::execute(VPTransformState &State) { 488 assert(!State.Instance && "VPInstruction executing an Instance"); 489 for (unsigned Part = 0; Part < State.UF; ++Part) 490 generateInstruction(State, Part); 491 } 492 493 void VPInstruction::print(raw_ostream &O, const Twine &Indent, 494 VPSlotTracker &SlotTracker) const { 495 O << "\"EMIT "; 496 print(O, SlotTracker); 497 } 498 499 void VPInstruction::print(raw_ostream &O) const { 500 VPSlotTracker SlotTracker(getParent()->getPlan()); 501 print(O, SlotTracker); 502 } 503 504 void VPInstruction::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { 505 if (hasResult()) { 506 printAsOperand(O, SlotTracker); 507 O << " = "; 508 } 509 510 switch (getOpcode()) { 511 case VPInstruction::Not: 512 O << "not"; 513 break; 514 case VPInstruction::ICmpULE: 515 O << "icmp ule"; 516 break; 517 case VPInstruction::SLPLoad: 518 O << "combined load"; 519 break; 520 case VPInstruction::SLPStore: 521 O << "combined store"; 522 break; 523 case VPInstruction::ActiveLaneMask: 524 O << "active lane mask"; 525 break; 526 527 default: 528 O << Instruction::getOpcodeName(getOpcode()); 529 } 530 531 for (const VPValue *Operand : operands()) { 532 O << " "; 533 Operand->printAsOperand(O, SlotTracker); 534 } 535 } 536 537 /// Generate the code inside the body of the vectorized loop. Assumes a single 538 /// LoopVectorBody basic-block was created for this. Introduce additional 539 /// basic-blocks as needed, and fill them all. 540 void VPlan::execute(VPTransformState *State) { 541 // -1. Check if the backedge taken count is needed, and if so build it. 542 if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) { 543 Value *TC = State->TripCount; 544 IRBuilder<> Builder(State->CFG.PrevBB->getTerminator()); 545 auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1), 546 "trip.count.minus.1"); 547 auto VF = State->VF; 548 Value *VTCMO = 549 VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast"); 550 for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) 551 State->set(BackedgeTakenCount, VTCMO, Part); 552 } 553 554 // 0. Set the reverse mapping from VPValues to Values for code generation. 555 for (auto &Entry : Value2VPValue) 556 State->VPValue2Value[Entry.second] = Entry.first; 557 558 BasicBlock *VectorPreHeaderBB = State->CFG.PrevBB; 559 BasicBlock *VectorHeaderBB = VectorPreHeaderBB->getSingleSuccessor(); 560 assert(VectorHeaderBB && "Loop preheader does not have a single successor."); 561 562 // 1. Make room to generate basic-blocks inside loop body if needed. 563 BasicBlock *VectorLatchBB = VectorHeaderBB->splitBasicBlock( 564 VectorHeaderBB->getFirstInsertionPt(), "vector.body.latch"); 565 Loop *L = State->LI->getLoopFor(VectorHeaderBB); 566 L->addBasicBlockToLoop(VectorLatchBB, *State->LI); 567 // Remove the edge between Header and Latch to allow other connections. 568 // Temporarily terminate with unreachable until CFG is rewired. 569 // Note: this asserts the generated code's assumption that 570 // getFirstInsertionPt() can be dereferenced into an Instruction. 571 VectorHeaderBB->getTerminator()->eraseFromParent(); 572 State->Builder.SetInsertPoint(VectorHeaderBB); 573 UnreachableInst *Terminator = State->Builder.CreateUnreachable(); 574 State->Builder.SetInsertPoint(Terminator); 575 576 // 2. Generate code in loop body. 577 State->CFG.PrevVPBB = nullptr; 578 State->CFG.PrevBB = VectorHeaderBB; 579 State->CFG.LastBB = VectorLatchBB; 580 581 for (VPBlockBase *Block : depth_first(Entry)) 582 Block->execute(State); 583 584 // Setup branch terminator successors for VPBBs in VPBBsToFix based on 585 // VPBB's successors. 586 for (auto VPBB : State->CFG.VPBBsToFix) { 587 assert(EnableVPlanNativePath && 588 "Unexpected VPBBsToFix in non VPlan-native path"); 589 BasicBlock *BB = State->CFG.VPBB2IRBB[VPBB]; 590 assert(BB && "Unexpected null basic block for VPBB"); 591 592 unsigned Idx = 0; 593 auto *BBTerminator = BB->getTerminator(); 594 595 for (VPBlockBase *SuccVPBlock : VPBB->getHierarchicalSuccessors()) { 596 VPBasicBlock *SuccVPBB = SuccVPBlock->getEntryBasicBlock(); 597 BBTerminator->setSuccessor(Idx, State->CFG.VPBB2IRBB[SuccVPBB]); 598 ++Idx; 599 } 600 } 601 602 // 3. Merge the temporary latch created with the last basic-block filled. 603 BasicBlock *LastBB = State->CFG.PrevBB; 604 // Connect LastBB to VectorLatchBB to facilitate their merge. 605 assert((EnableVPlanNativePath || 606 isa<UnreachableInst>(LastBB->getTerminator())) && 607 "Expected InnerLoop VPlan CFG to terminate with unreachable"); 608 assert((!EnableVPlanNativePath || isa<BranchInst>(LastBB->getTerminator())) && 609 "Expected VPlan CFG to terminate with branch in NativePath"); 610 LastBB->getTerminator()->eraseFromParent(); 611 BranchInst::Create(VectorLatchBB, LastBB); 612 613 // Merge LastBB with Latch. 614 bool Merged = MergeBlockIntoPredecessor(VectorLatchBB, nullptr, State->LI); 615 (void)Merged; 616 assert(Merged && "Could not merge last basic block with latch."); 617 VectorLatchBB = LastBB; 618 619 // We do not attempt to preserve DT for outer loop vectorization currently. 620 if (!EnableVPlanNativePath) 621 updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB, 622 L->getExitBlock()); 623 } 624 625 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 626 LLVM_DUMP_METHOD 627 void VPlan::dump() const { dbgs() << *this << '\n'; } 628 #endif 629 630 void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, 631 BasicBlock *LoopLatchBB, 632 BasicBlock *LoopExitBB) { 633 BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor(); 634 assert(LoopHeaderBB && "Loop preheader does not have a single successor."); 635 // The vector body may be more than a single basic-block by this point. 636 // Update the dominator tree information inside the vector body by propagating 637 // it from header to latch, expecting only triangular control-flow, if any. 638 BasicBlock *PostDomSucc = nullptr; 639 for (auto *BB = LoopHeaderBB; BB != LoopLatchBB; BB = PostDomSucc) { 640 // Get the list of successors of this block. 641 std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB)); 642 assert(Succs.size() <= 2 && 643 "Basic block in vector loop has more than 2 successors."); 644 PostDomSucc = Succs[0]; 645 if (Succs.size() == 1) { 646 assert(PostDomSucc->getSinglePredecessor() && 647 "PostDom successor has more than one predecessor."); 648 DT->addNewBlock(PostDomSucc, BB); 649 continue; 650 } 651 BasicBlock *InterimSucc = Succs[1]; 652 if (PostDomSucc->getSingleSuccessor() == InterimSucc) { 653 PostDomSucc = Succs[1]; 654 InterimSucc = Succs[0]; 655 } 656 assert(InterimSucc->getSingleSuccessor() == PostDomSucc && 657 "One successor of a basic block does not lead to the other."); 658 assert(InterimSucc->getSinglePredecessor() && 659 "Interim successor has more than one predecessor."); 660 assert(PostDomSucc->hasNPredecessors(2) && 661 "PostDom successor has more than two predecessors."); 662 DT->addNewBlock(InterimSucc, BB); 663 DT->addNewBlock(PostDomSucc, BB); 664 } 665 // Latch block is a new dominator for the loop exit. 666 DT->changeImmediateDominator(LoopExitBB, LoopLatchBB); 667 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 668 } 669 670 const Twine VPlanPrinter::getUID(const VPBlockBase *Block) { 671 return (isa<VPRegionBlock>(Block) ? "cluster_N" : "N") + 672 Twine(getOrCreateBID(Block)); 673 } 674 675 const Twine VPlanPrinter::getOrCreateName(const VPBlockBase *Block) { 676 const std::string &Name = Block->getName(); 677 if (!Name.empty()) 678 return Name; 679 return "VPB" + Twine(getOrCreateBID(Block)); 680 } 681 682 void VPlanPrinter::dump() { 683 Depth = 1; 684 bumpIndent(0); 685 OS << "digraph VPlan {\n"; 686 OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan"; 687 if (!Plan.getName().empty()) 688 OS << "\\n" << DOT::EscapeString(Plan.getName()); 689 if (Plan.BackedgeTakenCount) { 690 OS << ", where:\\n"; 691 Plan.BackedgeTakenCount->print(OS, SlotTracker); 692 OS << " := BackedgeTakenCount"; 693 } 694 OS << "\"]\n"; 695 OS << "node [shape=rect, fontname=Courier, fontsize=30]\n"; 696 OS << "edge [fontname=Courier, fontsize=30]\n"; 697 OS << "compound=true\n"; 698 699 for (const VPBlockBase *Block : depth_first(Plan.getEntry())) 700 dumpBlock(Block); 701 702 OS << "}\n"; 703 } 704 705 void VPlanPrinter::dumpBlock(const VPBlockBase *Block) { 706 if (const VPBasicBlock *BasicBlock = dyn_cast<VPBasicBlock>(Block)) 707 dumpBasicBlock(BasicBlock); 708 else if (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) 709 dumpRegion(Region); 710 else 711 llvm_unreachable("Unsupported kind of VPBlock."); 712 } 713 714 void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To, 715 bool Hidden, const Twine &Label) { 716 // Due to "dot" we print an edge between two regions as an edge between the 717 // exit basic block and the entry basic of the respective regions. 718 const VPBlockBase *Tail = From->getExitBasicBlock(); 719 const VPBlockBase *Head = To->getEntryBasicBlock(); 720 OS << Indent << getUID(Tail) << " -> " << getUID(Head); 721 OS << " [ label=\"" << Label << '\"'; 722 if (Tail != From) 723 OS << " ltail=" << getUID(From); 724 if (Head != To) 725 OS << " lhead=" << getUID(To); 726 if (Hidden) 727 OS << "; splines=none"; 728 OS << "]\n"; 729 } 730 731 void VPlanPrinter::dumpEdges(const VPBlockBase *Block) { 732 auto &Successors = Block->getSuccessors(); 733 if (Successors.size() == 1) 734 drawEdge(Block, Successors.front(), false, ""); 735 else if (Successors.size() == 2) { 736 drawEdge(Block, Successors.front(), false, "T"); 737 drawEdge(Block, Successors.back(), false, "F"); 738 } else { 739 unsigned SuccessorNumber = 0; 740 for (auto *Successor : Successors) 741 drawEdge(Block, Successor, false, Twine(SuccessorNumber++)); 742 } 743 } 744 745 void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) { 746 OS << Indent << getUID(BasicBlock) << " [label =\n"; 747 bumpIndent(1); 748 OS << Indent << "\"" << DOT::EscapeString(BasicBlock->getName()) << ":\\n\""; 749 bumpIndent(1); 750 751 // Dump the block predicate. 752 const VPValue *Pred = BasicBlock->getPredicate(); 753 if (Pred) { 754 OS << " +\n" << Indent << " \"BlockPredicate: "; 755 if (const VPInstruction *PredI = dyn_cast<VPInstruction>(Pred)) { 756 PredI->printAsOperand(OS, SlotTracker); 757 OS << " (" << DOT::EscapeString(PredI->getParent()->getName()) 758 << ")\\l\""; 759 } else 760 Pred->printAsOperand(OS, SlotTracker); 761 } 762 763 for (const VPRecipeBase &Recipe : *BasicBlock) { 764 OS << " +\n" << Indent; 765 Recipe.print(OS, Indent, SlotTracker); 766 OS << "\\l\""; 767 } 768 769 // Dump the condition bit. 770 const VPValue *CBV = BasicBlock->getCondBit(); 771 if (CBV) { 772 OS << " +\n" << Indent << " \"CondBit: "; 773 if (const VPInstruction *CBI = dyn_cast<VPInstruction>(CBV)) { 774 CBI->printAsOperand(OS, SlotTracker); 775 OS << " (" << DOT::EscapeString(CBI->getParent()->getName()) << ")\\l\""; 776 } else { 777 CBV->printAsOperand(OS, SlotTracker); 778 OS << "\""; 779 } 780 } 781 782 bumpIndent(-2); 783 OS << "\n" << Indent << "]\n"; 784 dumpEdges(BasicBlock); 785 } 786 787 void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) { 788 OS << Indent << "subgraph " << getUID(Region) << " {\n"; 789 bumpIndent(1); 790 OS << Indent << "fontname=Courier\n" 791 << Indent << "label=\"" 792 << DOT::EscapeString(Region->isReplicator() ? "<xVFxUF> " : "<x1> ") 793 << DOT::EscapeString(Region->getName()) << "\"\n"; 794 // Dump the blocks of the region. 795 assert(Region->getEntry() && "Region contains no inner blocks."); 796 for (const VPBlockBase *Block : depth_first(Region->getEntry())) 797 dumpBlock(Block); 798 bumpIndent(-1); 799 OS << Indent << "}\n"; 800 dumpEdges(Region); 801 } 802 803 void VPlanPrinter::printAsIngredient(raw_ostream &O, const Value *V) { 804 std::string IngredientString; 805 raw_string_ostream RSO(IngredientString); 806 if (auto *Inst = dyn_cast<Instruction>(V)) { 807 if (!Inst->getType()->isVoidTy()) { 808 Inst->printAsOperand(RSO, false); 809 RSO << " = "; 810 } 811 RSO << Inst->getOpcodeName() << " "; 812 unsigned E = Inst->getNumOperands(); 813 if (E > 0) { 814 Inst->getOperand(0)->printAsOperand(RSO, false); 815 for (unsigned I = 1; I < E; ++I) 816 Inst->getOperand(I)->printAsOperand(RSO << ", ", false); 817 } 818 } else // !Inst 819 V->printAsOperand(RSO, false); 820 RSO.flush(); 821 O << DOT::EscapeString(IngredientString); 822 } 823 824 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, 825 VPSlotTracker &SlotTracker) const { 826 O << "\"WIDEN-CALL " << VPlanIngredient(&Ingredient); 827 } 828 829 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, 830 VPSlotTracker &SlotTracker) const { 831 O << "\"WIDEN-SELECT" << VPlanIngredient(&Ingredient) 832 << (InvariantCond ? " (condition is loop invariant)" : ""); 833 } 834 835 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, 836 VPSlotTracker &SlotTracker) const { 837 O << "\"WIDEN\\l\""; 838 O << "\" " << VPlanIngredient(&Ingredient); 839 } 840 841 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, 842 VPSlotTracker &SlotTracker) const { 843 O << "\"WIDEN-INDUCTION"; 844 if (Trunc) { 845 O << "\\l\""; 846 O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; 847 O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc); 848 } else 849 O << " " << VPlanIngredient(IV); 850 } 851 852 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, 853 VPSlotTracker &SlotTracker) const { 854 O << "\"WIDEN-GEP "; 855 O << (IsPtrLoopInvariant ? "Inv" : "Var"); 856 size_t IndicesNumber = IsIndexLoopInvariant.size(); 857 for (size_t I = 0; I < IndicesNumber; ++I) 858 O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]"; 859 O << "\\l\""; 860 O << " +\n" << Indent << "\" " << VPlanIngredient(GEP); 861 } 862 863 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, 864 VPSlotTracker &SlotTracker) const { 865 O << "\"WIDEN-PHI " << VPlanIngredient(Phi); 866 } 867 868 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, 869 VPSlotTracker &SlotTracker) const { 870 O << "\"BLEND "; 871 Phi->printAsOperand(O, false); 872 O << " ="; 873 if (getNumIncomingValues() == 1) { 874 // Not a User of any mask: not really blending, this is a 875 // single-predecessor phi. 876 O << " "; 877 getIncomingValue(0)->printAsOperand(O, SlotTracker); 878 } else { 879 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { 880 O << " "; 881 getIncomingValue(I)->printAsOperand(O, SlotTracker); 882 O << "/"; 883 getMask(I)->printAsOperand(O, SlotTracker); 884 } 885 } 886 } 887 888 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, 889 VPSlotTracker &SlotTracker) const { 890 O << "\"REDUCE of" << *I << " as "; 891 ChainOp->printAsOperand(O, SlotTracker); 892 O << " + reduce("; 893 VecOp->printAsOperand(O, SlotTracker); 894 if (CondOp) { 895 O << ", "; 896 CondOp->printAsOperand(O, SlotTracker); 897 } 898 O << ")"; 899 } 900 901 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, 902 VPSlotTracker &SlotTracker) const { 903 O << "\"" << (IsUniform ? "CLONE " : "REPLICATE ") 904 << VPlanIngredient(Ingredient); 905 if (AlsoPack) 906 O << " (S->V)"; 907 } 908 909 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, 910 VPSlotTracker &SlotTracker) const { 911 O << "\"PHI-PREDICATED-INSTRUCTION " << VPlanIngredient(PredInst); 912 } 913 914 void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, 915 VPSlotTracker &SlotTracker) const { 916 O << "\"WIDEN " 917 << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " "; 918 919 bool First = true; 920 for (VPValue *Op : operands()) { 921 if (!First) 922 O << ", "; 923 Op->printAsOperand(O, SlotTracker); 924 First = false; 925 } 926 } 927 928 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { 929 Value *CanonicalIV = State.CanonicalIV; 930 Type *STy = CanonicalIV->getType(); 931 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); 932 ElementCount VF = State.VF; 933 assert(!VF.isScalable() && "the code following assumes non scalables ECs"); 934 Value *VStart = VF.isScalar() 935 ? CanonicalIV 936 : Builder.CreateVectorSplat(VF.getKnownMinValue(), 937 CanonicalIV, "broadcast"); 938 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { 939 SmallVector<Constant *, 8> Indices; 940 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 941 Indices.push_back( 942 ConstantInt::get(STy, Part * VF.getKnownMinValue() + Lane)); 943 // If VF == 1, there is only one iteration in the loop above, thus the 944 // element pushed back into Indices is ConstantInt::get(STy, Part) 945 Constant *VStep = 946 VF.isScalar() ? Indices.back() : ConstantVector::get(Indices); 947 // Add the consecutive indices to the vector value. 948 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); 949 State.set(getVPValue(), CanonicalVectorIV, Part); 950 } 951 } 952 953 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, 954 VPSlotTracker &SlotTracker) const { 955 O << "\"EMIT "; 956 getVPValue()->printAsOperand(O, SlotTracker); 957 O << " = WIDEN-CANONICAL-INDUCTION"; 958 } 959 960 template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT); 961 962 void VPValue::replaceAllUsesWith(VPValue *New) { 963 for (unsigned J = 0; J < getNumUsers();) { 964 VPUser *User = Users[J]; 965 unsigned NumUsers = getNumUsers(); 966 for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) 967 if (User->getOperand(I) == this) 968 User->setOperand(I, New); 969 // If a user got removed after updating the current user, the next user to 970 // update will be moved to the current position, so we only need to 971 // increment the index if the number of users did not change. 972 if (NumUsers == getNumUsers()) 973 J++; 974 } 975 } 976 977 void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const { 978 if (const Value *UV = getUnderlyingValue()) { 979 OS << "ir<"; 980 UV->printAsOperand(OS, false); 981 OS << ">"; 982 return; 983 } 984 985 unsigned Slot = Tracker.getSlot(this); 986 if (Slot == unsigned(-1)) 987 OS << "<badref>"; 988 else 989 OS << "vp<%" << Tracker.getSlot(this) << ">"; 990 } 991 992 void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region, 993 Old2NewTy &Old2New, 994 InterleavedAccessInfo &IAI) { 995 ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry()); 996 for (VPBlockBase *Base : RPOT) { 997 visitBlock(Base, Old2New, IAI); 998 } 999 } 1000 1001 void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, 1002 InterleavedAccessInfo &IAI) { 1003 if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) { 1004 for (VPRecipeBase &VPI : *VPBB) { 1005 assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions"); 1006 auto *VPInst = cast<VPInstruction>(&VPI); 1007 auto *Inst = cast<Instruction>(VPInst->getUnderlyingValue()); 1008 auto *IG = IAI.getInterleaveGroup(Inst); 1009 if (!IG) 1010 continue; 1011 1012 auto NewIGIter = Old2New.find(IG); 1013 if (NewIGIter == Old2New.end()) 1014 Old2New[IG] = new InterleaveGroup<VPInstruction>( 1015 IG->getFactor(), IG->isReverse(), IG->getAlign()); 1016 1017 if (Inst == IG->getInsertPos()) 1018 Old2New[IG]->setInsertPos(VPInst); 1019 1020 InterleaveGroupMap[VPInst] = Old2New[IG]; 1021 InterleaveGroupMap[VPInst]->insertMember( 1022 VPInst, IG->getIndex(Inst), 1023 Align(IG->isReverse() ? (-1) * int(IG->getFactor()) 1024 : IG->getFactor())); 1025 } 1026 } else if (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) 1027 visitRegion(Region, Old2New, IAI); 1028 else 1029 llvm_unreachable("Unsupported kind of VPBlock."); 1030 } 1031 1032 VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan, 1033 InterleavedAccessInfo &IAI) { 1034 Old2NewTy Old2New; 1035 visitRegion(cast<VPRegionBlock>(Plan.getEntry()), Old2New, IAI); 1036 } 1037 1038 void VPSlotTracker::assignSlot(const VPValue *V) { 1039 assert(Slots.find(V) == Slots.end() && "VPValue already has a slot!"); 1040 const Value *UV = V->getUnderlyingValue(); 1041 if (UV) 1042 return; 1043 const auto *VPI = dyn_cast<VPInstruction>(V); 1044 if (VPI && !VPI->hasResult()) 1045 return; 1046 1047 Slots[V] = NextSlot++; 1048 } 1049 1050 void VPSlotTracker::assignSlots(const VPBlockBase *VPBB) { 1051 if (auto *Region = dyn_cast<VPRegionBlock>(VPBB)) 1052 assignSlots(Region); 1053 else 1054 assignSlots(cast<VPBasicBlock>(VPBB)); 1055 } 1056 1057 void VPSlotTracker::assignSlots(const VPRegionBlock *Region) { 1058 ReversePostOrderTraversal<const VPBlockBase *> RPOT(Region->getEntry()); 1059 for (const VPBlockBase *Block : RPOT) 1060 assignSlots(Block); 1061 } 1062 1063 void VPSlotTracker::assignSlots(const VPBasicBlock *VPBB) { 1064 for (const VPRecipeBase &Recipe : *VPBB) { 1065 if (const auto *VPI = dyn_cast<VPInstruction>(&Recipe)) 1066 assignSlot(VPI); 1067 else if (const auto *VPIV = dyn_cast<VPWidenCanonicalIVRecipe>(&Recipe)) 1068 assignSlot(VPIV->getVPValue()); 1069 } 1070 } 1071 1072 void VPSlotTracker::assignSlots(const VPlan &Plan) { 1073 1074 for (const VPValue *V : Plan.VPExternalDefs) 1075 assignSlot(V); 1076 1077 for (auto &E : Plan.Value2VPValue) 1078 if (!isa<VPInstruction>(E.second)) 1079 assignSlot(E.second); 1080 1081 for (const VPValue *V : Plan.VPCBVs) 1082 assignSlot(V); 1083 1084 if (Plan.BackedgeTakenCount) 1085 assignSlot(Plan.BackedgeTakenCount); 1086 1087 ReversePostOrderTraversal<const VPBlockBase *> RPOT(Plan.getEntry()); 1088 for (const VPBlockBase *Block : RPOT) 1089 assignSlots(Block); 1090 } 1091