1 //===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements the Loop Distribution Pass. Its main focus is to 11 // distribute loops that cannot be vectorized due to dependence cycles. It 12 // tries to isolate the offending dependences into a new loop allowing 13 // vectorization of the remaining parts. 14 // 15 // For dependence analysis, the pass uses the LoopVectorizer's 16 // LoopAccessAnalysis. Because this analysis presumes no change in the order of 17 // memory operations, special care is taken to preserve the lexical order of 18 // these operations. 19 // 20 // Similarly to the Vectorizer, the pass also supports loop versioning to 21 // run-time disambiguate potentially overlapping arrays. 
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <list>

#define LDIST_NAME "loop-distribute"
#define DEBUG_TYPE LDIST_NAME

using namespace llvm;

// When set, run LoopInfo and DominatorTree verification after each
// distributed loop to catch analysis-update bugs early.
static cl::opt<bool>
    LDistVerify("loop-distribute-verify", cl::Hidden,
                cl::desc("Turn on DominatorTree and LoopInfo verification "
                         "after Loop Distribution"),
                cl::init(false));

// By default, a partition whose stores are all conditional is merged back into
// a cyclic partition because the vectorizer would reject it (not
// if-convertible).  This flag disables that heuristic.
static cl::opt<bool> DistributeNonIfConvertible(
    "loop-distribute-non-if-convertible", cl::Hidden,
    cl::desc("Whether to distribute into a loop that may not be "
             "if-convertible by the loop vectorizer"),
    cl::init(false));

STATISTIC(NumLoopsDistributed, "Number of loops distributed");

/// \brief Remaps instructions in a loop including the preheader.
///
/// \p Blocks are the cloned blocks and \p VMap maps original values to their
/// clones.  Operands (and debug info) still referring to original values are
/// rewritten to the cloned ones; values without a VMap entry are left alone
/// (RF_IgnoreMissingEntries).
static void remapInstructionsInLoop(const SmallVectorImpl<BasicBlock *> &Blocks,
                                    ValueToValueMapTy &VMap) {
  // Rewrite the code to refer to itself.
  for (auto *BB : Blocks)
    for (auto &Inst : *BB)
      RemapInstruction(&Inst, VMap,
                       RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
}

/// \brief Clones a loop \p OrigLoop.  Returns the loop and the blocks in \p
/// Blocks.
///
/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block
/// \p LoopDomBB.  Insert the new blocks before block specified in \p Before.
static Loop *cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
                                    Loop *OrigLoop, ValueToValueMapTy &VMap,
                                    const Twine &NameSuffix, LoopInfo *LI,
                                    DominatorTree *DT,
                                    SmallVectorImpl<BasicBlock *> &Blocks) {
  Function *F = OrigLoop->getHeader()->getParent();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Register the clone in LoopInfo at the same nesting level as the original.
  Loop *NewLoop = new Loop();
  if (ParentLoop)
    ParentLoop->addChildLoop(NewLoop);
  else
    LI->addTopLevelLoop(NewLoop);

  // Clone the preheader first so the loop PHIs can be remapped to it.
  BasicBlock *OrigPH = OrigLoop->getLoopPreheader();
  BasicBlock *NewPH = CloneBasicBlock(OrigPH, VMap, NameSuffix, F);
  // To rename the loop PHIs.
  VMap[OrigPH] = NewPH;
  Blocks.push_back(NewPH);

  // Update LoopInfo.  The preheader is not part of the loop itself, so it only
  // belongs to the parent loop, if any.
  if (ParentLoop)
    ParentLoop->addBasicBlockToLoop(NewPH, *LI);

  // Update DominatorTree.
  DT->addNewBlock(NewPH, LoopDomBB);

  for (BasicBlock *BB : OrigLoop->getBlocks()) {
    BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F);
    VMap[BB] = NewBB;

    // Update LoopInfo.
    NewLoop->addBasicBlockToLoop(NewBB, *LI);

    // Update DominatorTree.  The clone's idom is the clone of the original's
    // idom; by the time we get here the idom's clone is already in VMap
    // because blocks are visited in loop-block order.
    BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock();
    DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));

    Blocks.push_back(NewBB);
  }

  // Move them physically from the end of the block list.
  F->getBasicBlockList().splice(Before, F->getBasicBlockList(), NewPH);
  F->getBasicBlockList().splice(Before, F->getBasicBlockList(),
                                NewLoop->getHeader(), F->end());

  return NewLoop;
}

namespace {
/// \brief Maintains the set of instructions of the loop for a partition before
/// cloning.  After cloning, it hosts the new loop.
125 class InstPartition { 126 typedef SmallPtrSet<Instruction *, 8> InstructionSet; 127 128 public: 129 InstPartition(Instruction *I, Loop *L, bool DepCycle = false) 130 : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) { 131 Set.insert(I); 132 } 133 134 /// \brief Returns whether this partition contains a dependence cycle. 135 bool hasDepCycle() const { return DepCycle; } 136 137 /// \brief Adds an instruction to this partition. 138 void add(Instruction *I) { Set.insert(I); } 139 140 /// \brief Collection accessors. 141 InstructionSet::iterator begin() { return Set.begin(); } 142 InstructionSet::iterator end() { return Set.end(); } 143 InstructionSet::const_iterator begin() const { return Set.begin(); } 144 InstructionSet::const_iterator end() const { return Set.end(); } 145 bool empty() const { return Set.empty(); } 146 147 /// \brief Moves this partition into \p Other. This partition becomes empty 148 /// after this. 149 void moveTo(InstPartition &Other) { 150 Other.Set.insert(Set.begin(), Set.end()); 151 Set.clear(); 152 Other.DepCycle |= DepCycle; 153 } 154 155 /// \brief Populates the partition with a transitive closure of all the 156 /// instructions that the seeded instructions dependent on. 157 void populateUsedSet() { 158 // FIXME: We currently don't use control-dependence but simply include all 159 // blocks (possibly empty at the end) and let simplifycfg mostly clean this 160 // up. 161 for (auto *B : OrigLoop->getBlocks()) 162 Set.insert(B->getTerminator()); 163 164 // Follow the use-def chains to form a transitive closure of all the 165 // instructions that the originally seeded instructions depend on. 166 SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end()); 167 while (!Worklist.empty()) { 168 Instruction *I = Worklist.pop_back_val(); 169 // Insert instructions from the loop that we depend on. 
170 for (Value *V : I->operand_values()) { 171 auto *I = dyn_cast<Instruction>(V); 172 if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second) 173 Worklist.push_back(I); 174 } 175 } 176 } 177 178 /// \brief Clones the original loop. 179 /// 180 /// Updates LoopInfo and DominatorTree using the information that block \p 181 /// LoopDomBB dominates the loop. 182 Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB, 183 unsigned Index, LoopInfo *LI, 184 DominatorTree *DT) { 185 ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop, 186 VMap, Twine(".ldist") + Twine(Index), 187 LI, DT, ClonedLoopBlocks); 188 return ClonedLoop; 189 } 190 191 /// \brief The cloned loop. If this partition is mapped to the original loop, 192 /// this is null. 193 const Loop *getClonedLoop() const { return ClonedLoop; } 194 195 /// \brief Returns the loop where this partition ends up after distribution. 196 /// If this partition is mapped to the original loop then use the block from 197 /// the loop. 198 const Loop *getDistributedLoop() const { 199 return ClonedLoop ? ClonedLoop : OrigLoop; 200 } 201 202 /// \brief The VMap that is populated by cloning and then used in 203 /// remapinstruction to remap the cloned instructions. 204 ValueToValueMapTy &getVMap() { return VMap; } 205 206 /// \brief Remaps the cloned instructions using VMap. 207 void remapInstructions() { remapInstructionsInLoop(ClonedLoopBlocks, VMap); } 208 209 /// \brief Based on the set of instructions selected for this partition, 210 /// removes the unnecessary ones. 
211 void removeUnusedInsts() { 212 SmallVector<Instruction *, 8> Unused; 213 214 for (auto *Block : OrigLoop->getBlocks()) 215 for (auto &Inst : *Block) 216 if (!Set.count(&Inst)) { 217 Instruction *NewInst = &Inst; 218 if (!VMap.empty()) 219 NewInst = cast<Instruction>(VMap[NewInst]); 220 221 assert(!isa<BranchInst>(NewInst) && 222 "Branches are marked used early on"); 223 Unused.push_back(NewInst); 224 } 225 226 // Delete the instructions backwards, as it has a reduced likelihood of 227 // having to update as many def-use and use-def chains. 228 for (auto I = Unused.rbegin(), E = Unused.rend(); I != E; ++I) { 229 auto *Inst = *I; 230 231 if (!Inst->use_empty()) 232 Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); 233 Inst->eraseFromParent(); 234 } 235 } 236 237 void print() const { 238 if (DepCycle) 239 dbgs() << " (cycle)\n"; 240 for (auto *I : Set) 241 // Prefix with the block name. 242 dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n"; 243 } 244 245 void printBlocks() const { 246 for (auto *BB : getDistributedLoop()->getBlocks()) 247 dbgs() << *BB; 248 } 249 250 private: 251 /// \brief Instructions from OrigLoop selected for this partition. 252 InstructionSet Set; 253 254 /// \brief Whether this partition contains a dependence cycle. 255 bool DepCycle; 256 257 /// \brief The original loop. 258 Loop *OrigLoop; 259 260 /// \brief The cloned loop. If this partition is mapped to the original loop, 261 /// this is null. 262 Loop *ClonedLoop; 263 264 /// \brief The blocks of ClonedLoop including the preheader. If this 265 /// partition is mapped to the original loop, this is empty. 266 SmallVector<BasicBlock *, 8> ClonedLoopBlocks; 267 268 /// \brief These gets populated once the set of instructions have been 269 /// finalized. If this partition is mapped to the original loop, these are not 270 /// set. 271 ValueToValueMapTy VMap; 272 }; 273 274 /// \brief Holds the set of Partitions. 
It populates them, merges them and then 275 /// clones the loops. 276 class InstPartitionContainer { 277 typedef DenseMap<Instruction *, int> InstToPartitionIdT; 278 279 public: 280 InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT) 281 : L(L), LI(LI), DT(DT) {} 282 283 /// \brief Returns the number of partitions. 284 unsigned getSize() const { return PartitionContainer.size(); } 285 286 /// \brief Adds \p Inst into the current partition if that is marked to 287 /// contain cycles. Otherwise start a new partition for it. 288 void addToCyclicPartition(Instruction *Inst) { 289 // If the current partition is non-cyclic. Start a new one. 290 if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle()) 291 PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true); 292 else 293 PartitionContainer.back().add(Inst); 294 } 295 296 /// \brief Adds \p Inst into a partition that is not marked to contain 297 /// dependence cycles. 298 /// 299 // Initially we isolate memory instructions into as many partitions as 300 // possible, then later we may merge them back together. 301 void addToNewNonCyclicPartition(Instruction *Inst) { 302 PartitionContainer.emplace_back(Inst, L); 303 } 304 305 /// \brief Merges adjacent non-cyclic partitions. 306 /// 307 /// The idea is that we currently only want to isolate the non-vectorizable 308 /// partition. We could later allow more distribution among these partition 309 /// too. 310 void mergeAdjacentNonCyclic() { 311 mergeAdjacentPartitionsIf( 312 [](const InstPartition *P) { return !P->hasDepCycle(); }); 313 } 314 315 /// \brief If a partition contains only conditional stores, we won't vectorize 316 /// it. Try to merge it with a previous cyclic partition. 317 void mergeNonIfConvertible() { 318 mergeAdjacentPartitionsIf([&](const InstPartition *Partition) { 319 if (Partition->hasDepCycle()) 320 return true; 321 322 // Now, check if all stores are conditional in this partition. 
      // seenStore guards against the vacuous case: a partition with no stores
      // at all is not considered "all-conditional-stores".
      bool seenStore = false;

      for (auto *Inst : *Partition)
        if (isa<StoreInst>(Inst)) {
          seenStore = true;
          // An unconditional store makes the partition if-convertible, so it
          // should not be merged away.
          if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
            return false;
        }
      return seenStore;
    });
  }

  /// \brief Merges the partitions according to various heuristics.
  void mergeBeforePopulating() {
    mergeAdjacentNonCyclic();
    if (!DistributeNonIfConvertible)
      mergeNonIfConvertible();
  }

  /// \brief Merges partitions in order to ensure that no loads are duplicated.
  ///
  /// We can't duplicate loads because that could potentially reorder them.
  /// LoopAccessAnalysis provides dependency information with the context that
  /// the order of memory operation is preserved.
  ///
  /// Return if any partitions were merged.
  bool mergeToAvoidDuplicatedLoads() {
    typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT;
    typedef EquivalenceClasses<InstPartition *> ToBeMergedT;

    LoadToPartitionT LoadToPartition;
    ToBeMergedT ToBeMerged;

    // Step through the partitions and create equivalence between partitions
    // that contain the same load.  Also put partitions in between them in the
    // same equivalence class to avoid reordering of memory operations.
    for (PartitionContainerT::iterator I = PartitionContainer.begin(),
                                       E = PartitionContainer.end();
         I != E; ++I) {
      auto *PartI = &*I;

      // If a load occurs in two partitions PartI and PartJ, merge all
      // partitions (PartI, PartJ] into PartI.
      for (Instruction *Inst : *PartI)
        if (isa<LoadInst>(Inst)) {
          bool NewElt;
          LoadToPartitionT::iterator LoadToPart;

          std::tie(LoadToPart, NewElt) =
              LoadToPartition.insert(std::make_pair(Inst, PartI));
          if (!NewElt) {
            DEBUG(dbgs() << "Merging partitions due to this load in multiple "
                         << "partitions: " << PartI << ", "
                         << LoadToPart->second << "\n" << *Inst << "\n");

            // Walk backwards from PartI to the earlier partition holding the
            // load, unioning every partition in between into one class.
            auto PartJ = I;
            do {
              --PartJ;
              ToBeMerged.unionSets(PartI, &*PartJ);
            } while (&*PartJ != LoadToPart->second);
          }
        }
    }
    if (ToBeMerged.empty())
      return false;

    // Merge the member of an equivalence class into its class leader.  This
    // makes the members empty.
    for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
         I != E; ++I) {
      if (!I->isLeader())
        continue;

      auto PartI = I->getData();
      for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
                                   ToBeMerged.member_end())) {
        PartJ->moveTo(*PartI);
      }
    }

    // Remove the empty partitions.
    PartitionContainer.remove_if(
        [](const InstPartition &P) { return P.empty(); });

    return true;
  }

  /// \brief Sets up the mapping between instructions to partitions.  If the
  /// instruction is duplicated across multiple partitions, set the entry to -1.
  void setupPartitionIdOnInstructions() {
    int PartitionID = 0;
    for (const auto &Partition : PartitionContainer) {
      for (Instruction *Inst : Partition) {
        bool NewElt;
        InstToPartitionIdT::iterator Iter;

        std::tie(Iter, NewElt) =
            InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
        // Already present means the instruction lives in more than one
        // partition: mark with the sentinel -1.
        if (!NewElt)
          Iter->second = -1;
      }
      ++PartitionID;
    }
  }

  /// \brief Populates the partition with everything that the seeding
  /// instructions require.
  void populateUsedSet() {
    for (auto &P : PartitionContainer)
      P.populateUsedSet();
  }

  /// \brief This performs the main chunk of the work of cloning the loops for
  /// the partitions.
  void cloneLoops(Pass *P) {
    BasicBlock *OrigPH = L->getLoopPreheader();
    // At this point the predecessor of the preheader is either the memcheck
    // block or the top part of the original preheader.
    BasicBlock *Pred = OrigPH->getSinglePredecessor();
    assert(Pred && "Preheader does not have a single predecessor");
    BasicBlock *ExitBlock = L->getExitBlock();
    assert(ExitBlock && "No single exit block");
    Loop *NewLoop;

    assert(!PartitionContainer.empty() && "at least two partitions expected");
    // We're cloning the preheader along with the loop so we already made sure
    // it was empty.
    assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
           "preheader not empty");

    // Create a loop for each partition except the last.  Clone the original
    // loop before PH along with adding a preheader for the cloned loop.  Then
    // update PH to point to the newly added preheader.  The last partition
    // reuses the original loop, hence "except the last" (iteration starts at
    // std::next of rbegin).
    BasicBlock *TopPH = OrigPH;
    unsigned Index = getSize() - 1;
    for (auto I = std::next(PartitionContainer.rbegin()),
              E = PartitionContainer.rend();
         I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
      auto *Part = &*I;

      NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);

      // The cloned loop exits into the preheader of the loop below it; remap
      // the cloned exit branches accordingly.
      Part->getVMap()[ExitBlock] = TopPH;
      Part->remapInstructions();
    }
    Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);

    // Now go in forward order and update the immediate dominator for the
    // preheaders with the exiting block of the previous loop.  Dominance
    // within the loop is updated in cloneLoopWithPreheader.
    for (auto Curr = PartitionContainer.cbegin(),
              Next = std::next(PartitionContainer.cbegin()),
              E = PartitionContainer.cend();
         Next != E; ++Curr, ++Next)
      DT->changeImmediateDominator(
          Next->getDistributedLoop()->getLoopPreheader(),
          Curr->getDistributedLoop()->getExitingBlock());
  }

  /// \brief Removes the dead instructions from the cloned loops.
  void removeUnusedInsts() {
    for (auto &Partition : PartitionContainer)
      Partition.removeUnusedInsts();
  }

  /// \brief For each memory pointer, it computes the partitionId the pointer is
  /// used in.
  ///
  /// This returns an array of int where the I-th entry corresponds to I-th
  /// entry in LAI.getRuntimePointerCheck().  If the pointer is used in multiple
  /// partitions its entry is set to -1.
  SmallVector<int, 8>
  computePartitionSetForPointers(const LoopAccessInfo &LAI) {
    const LoopAccessInfo::RuntimePointerCheck *RtPtrCheck =
        LAI.getRuntimePointerCheck();

    unsigned N = RtPtrCheck->Pointers.size();
    SmallVector<int, 8> PtrToPartitions(N);
    for (unsigned I = 0; I < N; ++I) {
      Value *Ptr = RtPtrCheck->Pointers[I];
      auto Instructions =
          LAI.getInstructionsForAccess(Ptr, RtPtrCheck->IsWritePtr[I]);

      int &Partition = PtrToPartitions[I];
      // First set it to uninitialized.
      Partition = -2;
      for (Instruction *Inst : Instructions) {
        // Note that this could be -1 if Inst is duplicated across multiple
        // partitions.
        int ThisPartition = this->InstToPartitionId[Inst];
        if (Partition == -2)
          Partition = ThisPartition;
        // -1 means belonging to multiple partitions.
        else if (Partition == -1)
          break;
        else if (Partition != (int)ThisPartition)
          Partition = -1;
      }
      assert(Partition != -2 && "Pointer not belonging to any partition");
    }

    return PtrToPartitions;
  }

  void print(raw_ostream &OS) const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      OS << "Partition " << Index++ << " (" << &P << "):\n";
      P.print();
    }
  }

  void dump() const { print(dbgs()); }

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const InstPartitionContainer &Partitions) {
    Partitions.print(OS);
    return OS;
  }
#endif

  void printBlocks() const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
      P.printBlocks();
    }
  }

private:
  // std::list is used (rather than a vector) because merging invalidates no
  // iterators and partitions are addressed by pointer in the merge maps.
  typedef std::list<InstPartition> PartitionContainerT;

  /// \brief List of partitions.
  PartitionContainerT PartitionContainer;

  /// \brief Mapping from Instruction to partition Id.  If the instruction
  /// belongs to multiple partitions the entry contains -1.
  InstToPartitionIdT InstToPartitionId;

  Loop *L;
  LoopInfo *LI;
  DominatorTree *DT;

  /// \brief The control structure to merge adjacent partitions if both satisfy
  /// the \p Predicate.
  template <class UnaryPredicate>
  void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
    InstPartition *PrevMatch = nullptr;
    for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
      auto DoesMatch = Predicate(&*I);
      if (PrevMatch == nullptr && DoesMatch) {
        // First of a run of matching partitions; subsequent matches fold into
        // this one.
        PrevMatch = &*I;
        ++I;
      } else if (PrevMatch != nullptr && DoesMatch) {
        I->moveTo(*PrevMatch);
        I = PartitionContainer.erase(I);
      } else {
        // Non-matching partition ends the current run.
        PrevMatch = nullptr;
        ++I;
      }
    }
  }
};

/// \brief For each memory instruction, this class maintains difference of the
/// number of unsafe dependences that start out from this instruction minus
/// those that end here.
///
/// By traversing the memory instructions in program order and accumulating this
/// number, we know whether any unsafe dependence crosses over a program point.
class MemoryInstructionDependences {
  typedef MemoryDepChecker::Dependence Dependence;

public:
  struct Entry {
    Instruction *Inst;
    // +1 for each unsafe dependence starting here, -1 for each ending here.
    unsigned NumUnsafeDependencesStartOrEnd;

    Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {}
  };

  typedef SmallVector<Entry, 8> AccessesType;

  AccessesType::const_iterator begin() const { return Accesses.begin(); }
  AccessesType::const_iterator end() const { return Accesses.end(); }

  MemoryInstructionDependences(
      const SmallVectorImpl<Instruction *> &Instructions,
      const SmallVectorImpl<Dependence> &InterestingDependences) {
    Accesses.append(Instructions.begin(), Instructions.end());

    DEBUG(dbgs() << "Backward dependences:\n");
    for (auto &Dep : InterestingDependences)
      if (Dep.isPossiblyBackward()) {
        // Note that the designations source and destination follow the program
        // order, i.e. source is always first.  (The direction is given by the
        // DepType.)
        ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
        --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;

        DEBUG(Dep.print(dbgs(), 2, Instructions));
      }
  }

private:
  AccessesType Accesses;
};

/// \brief Handles the loop versioning based on memchecks.
class RuntimeCheckEmitter {
public:
  RuntimeCheckEmitter(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI,
                      DominatorTree *DT)
      : OrigLoop(L), NonDistributedLoop(nullptr), LAI(LAI), LI(LI), DT(DT) {}

  /// \brief Given the \p Partitions formed by Loop Distribution, it determines
  /// in which partition each pointer is used.
  void partitionPointers(InstPartitionContainer &Partitions) {
    // Set up partition id in PtrRtChecks.  Ptr -> Access -> Instruction ->
    // Partition.
    PtrToPartition = Partitions.computePartitionSetForPointers(LAI);

    DEBUG(dbgs() << "\nPointers:\n");
    DEBUG(LAI.getRuntimePointerCheck()->print(dbgs(), 0, &PtrToPartition));
  }

  /// \brief Returns true if we need memchecks to distribute the loop.
  bool needsRuntimeChecks() const {
    return LAI.getRuntimePointerCheck()->needsAnyChecking(&PtrToPartition);
  }

  /// \brief Performs the CFG manipulation part of versioning the loop including
  /// the DominatorTree and LoopInfo updates.
  void versionLoop(Pass *P) {
    Instruction *FirstCheckInst;
    Instruction *MemRuntimeCheck;
    // Add the memcheck in the original preheader (this is empty initially).
    BasicBlock *MemCheckBB = OrigLoop->getLoopPreheader();
    std::tie(FirstCheckInst, MemRuntimeCheck) =
        LAI.addRuntimeCheck(MemCheckBB->getTerminator(), &PtrToPartition);
    assert(MemRuntimeCheck && "called even though needsAnyChecking = false");

    // Rename the block to make the IR more readable.
667 MemCheckBB->setName(OrigLoop->getHeader()->getName() + ".ldist.memcheck"); 668 669 // Create empty preheader for the loop (and after cloning for the 670 // original/nondist loop). 671 BasicBlock *PH = 672 SplitBlock(MemCheckBB, MemCheckBB->getTerminator(), DT, LI); 673 PH->setName(OrigLoop->getHeader()->getName() + ".ph"); 674 675 // Clone the loop including the preheader. 676 // 677 // FIXME: This does not currently preserve SimplifyLoop because the exit 678 // block is a join between the two loops. 679 SmallVector<BasicBlock *, 8> NonDistributedLoopBlocks; 680 NonDistributedLoop = 681 cloneLoopWithPreheader(PH, MemCheckBB, OrigLoop, VMap, ".ldist.nondist", 682 LI, DT, NonDistributedLoopBlocks); 683 remapInstructionsInLoop(NonDistributedLoopBlocks, VMap); 684 685 // Insert the conditional branch based on the result of the memchecks. 686 Instruction *OrigTerm = MemCheckBB->getTerminator(); 687 BranchInst::Create(NonDistributedLoop->getLoopPreheader(), 688 OrigLoop->getLoopPreheader(), MemRuntimeCheck, OrigTerm); 689 OrigTerm->eraseFromParent(); 690 691 // The loops merge in the original exit block. This is now dominated by the 692 // memchecking block. 693 DT->changeImmediateDominator(OrigLoop->getExitBlock(), MemCheckBB); 694 } 695 696 /// \brief Adds the necessary PHI nodes for the versioned loops based on the 697 /// loop-defined values used outside of the loop. 698 void addPHINodes(const SmallVectorImpl<Instruction *> &DefsUsedOutside) { 699 BasicBlock *PHIBlock = OrigLoop->getExitBlock(); 700 assert(PHIBlock && "No single successor to loop exit block"); 701 702 for (auto *Inst : DefsUsedOutside) { 703 auto *NonDistInst = cast<Instruction>(VMap[Inst]); 704 PHINode *PN; 705 706 // First see if we have a single-operand PHI with the value defined by the 707 // original loop. 
708 for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) { 709 assert(PN->getNumOperands() == 1 && 710 "Exit block should only have on predecessor"); 711 if (PN->getIncomingValue(0) == Inst) 712 break; 713 } 714 // If not create it. 715 if (!PN) { 716 PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".ldist", 717 PHIBlock->begin()); 718 for (auto *User : Inst->users()) 719 if (!OrigLoop->contains(cast<Instruction>(User)->getParent())) 720 User->replaceUsesOfWith(Inst, PN); 721 PN->addIncoming(Inst, OrigLoop->getExitingBlock()); 722 } 723 // Add the new incoming value from the non-distributed loop. 724 PN->addIncoming(NonDistInst, NonDistributedLoop->getExitingBlock()); 725 } 726 } 727 728 private: 729 /// \brief The original loop. This becomes the "versioned" one, i.e. control 730 /// goes if the memchecks all pass. 731 Loop *OrigLoop; 732 /// \brief The fall-back loop, i.e. if any of the memchecks fail. 733 Loop *NonDistributedLoop; 734 735 /// \brief For each memory pointer it contains the partitionId it is used in. 736 /// 737 /// The I-th entry corresponds to I-th entry in LAI.getRuntimePointerCheck(). 738 /// If the pointer is used in multiple partitions the entry is set to -1. 739 SmallVector<int, 8> PtrToPartition; 740 741 /// \brief This maps the instructions from OrigLoop to their counterpart in 742 /// NonDistributedLoop. 743 ValueToValueMapTy VMap; 744 745 /// \brief Analyses used. 746 const LoopAccessInfo &LAI; 747 LoopInfo *LI; 748 DominatorTree *DT; 749 }; 750 751 /// \brief Returns the instructions that use values defined in the loop. 752 static SmallVector<Instruction *, 8> findDefsUsedOutsideOfLoop(Loop *L) { 753 SmallVector<Instruction *, 8> UsedOutside; 754 755 for (auto *Block : L->getBlocks()) 756 // FIXME: I believe that this could use copy_if if the Inst reference could 757 // be adapted into a pointer. 
    for (auto &Inst : *Block) {
      auto Users = Inst.users();
      // An instruction is "used outside" if any user lives in a block the
      // loop does not contain.
      if (std::any_of(Users.begin(), Users.end(), [&](User *U) {
            auto *Use = cast<Instruction>(U);
            return !L->contains(Use->getParent());
          }))
        UsedOutside.push_back(&Inst);
    }

  return UsedOutside;
}

/// \brief The pass class.
class LoopDistribute : public FunctionPass {
public:
  LoopDistribute() : FunctionPass(ID) {
    initializeLoopDistributePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    LAA = &getAnalysis<LoopAccessAnalysis>();
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();

    // Build up a worklist of inner-loops to vectorize.  This is necessary as
    // the act of distributing a loop creates new loops and can invalidate
    // iterators across the loops.
    SmallVector<Loop *, 8> Worklist;

    for (Loop *TopLevelLoop : *LI)
      for (Loop *L : depth_first(TopLevelLoop))
        // We only handle inner-most loops.
        if (L->empty())
          Worklist.push_back(L);

    // Now walk the identified inner loops.
    bool Changed = false;
    for (Loop *L : Worklist)
      Changed |= processLoop(L);

    // Process each loop nest in the function.
    return Changed;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addRequired<LoopAccessAnalysis>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
  }

  static char ID;

private:
  /// \brief Try to distribute an inner-most loop.
  ///
  /// Returns true if the loop was distributed (IR changed).
  bool processLoop(Loop *L) {
    assert(L->empty() && "Only process inner loops.");

    DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
                 << "\" checking " << *L << "\n");

    BasicBlock *PH = L->getLoopPreheader();
    if (!PH) {
      DEBUG(dbgs() << "Skipping; no preheader");
      return false;
    }
    if (!L->getExitBlock()) {
      DEBUG(dbgs() << "Skipping; multiple exit blocks");
      return false;
    }
    // LAA will check that we only have a single exiting block.

    const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap());

    // Currently, we only distribute to isolate the part of the loop with
    // dependence cycles to enable partial vectorization.
    if (LAI.canVectorizeMemory()) {
      DEBUG(dbgs() << "Skipping; memory operations are safe for vectorization");
      return false;
    }
    auto *InterestingDependences =
        LAI.getDepChecker().getInterestingDependences();
    if (!InterestingDependences || InterestingDependences->empty()) {
      DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate");
      return false;
    }

    InstPartitionContainer Partitions(L, LI, DT);

    // First, go through each memory operation and assign them to consecutive
    // partitions (the order of partitions follows program order).  Put those
    // with unsafe dependences into "cyclic" partition otherwise put each store
    // in its own "non-cyclic" partition (we'll merge these later).
    //
    // Note that a memory operation (e.g. Load2 below) at a program point that
    // has an unsafe dependence (Store3->Load1) spanning over it must be
    // included in the same cyclic partition as the dependent operations.  This
    // is to preserve the original program order after distribution.  E.g.:
    //
    //               NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
    //  Load1   -.                     1                       0->1
    //  Load2    | /Unsafe/            0                       1
    //  Store3  -'                    -1                       1->0
    //  Load4                          0                       0
    //
    // NumUnsafeDependencesActive > 0 indicates this situation and in this case
    // we just keep assigning to the same cyclic partition until
    // NumUnsafeDependencesActive reaches 0.
    const MemoryDepChecker &DepChecker = LAI.getDepChecker();
    MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
                                     *InterestingDependences);

    int NumUnsafeDependencesActive = 0;
    for (auto &InstDep : MID) {
      Instruction *I = InstDep.Inst;
      // We update NumUnsafeDependencesActive post-instruction, catch the
      // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
      if (NumUnsafeDependencesActive ||
          InstDep.NumUnsafeDependencesStartOrEnd > 0)
        Partitions.addToCyclicPartition(I);
      else
        Partitions.addToNewNonCyclicPartition(I);
      NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
      assert(NumUnsafeDependencesActive >= 0 &&
             "Negative number of dependences active");
    }

    // Add partitions for values used outside.  These partitions can be out of
    // order from the original program order.  This is OK because if the
    // partition uses a load we will merge this partition with the original
    // partition of the load that we set up in the previous loop (see
    // mergeToAvoidDuplicatedLoads).
    auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
    for (auto *Inst : DefsUsedOutside)
      Partitions.addToNewNonCyclicPartition(Inst);

    DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return false;

    // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
    // should be able to vectorize these together.
    Partitions.mergeBeforePopulating();
    DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return false;

    // Now, populate the partitions with non-memory operations.
    Partitions.populateUsedSet();
    DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);

    // In order to preserve original lexical order for loads, keep them in the
    // partition that we set up in the MemoryInstructionDependences loop.
    if (Partitions.mergeToAvoidDuplicatedLoads()) {
      DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
                   << Partitions);
      // Merging may have collapsed everything back into a single partition,
      // in which case there is nothing to distribute.
      if (Partitions.getSize() < 2)
        return false;
    }

    DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
    // We're done forming the partitions set up the reverse mapping from
    // instructions to partitions.
    Partitions.setupPartitionIdOnInstructions();

    // To keep things simple have an empty preheader before we version or clone
    // the loop.  (Also split if this has no predecessor, i.e. entry, because we
    // rely on PH having a predecessor.)
    if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
      SplitBlock(PH, PH->getTerminator(), DT, LI);

    // If we need run-time checks to disambiguate pointers at run-time, version
    // the loop now.
    RuntimeCheckEmitter RtCheckEmitter(LAI, L, LI, DT);
    RtCheckEmitter.partitionPointers(Partitions);
    if (RtCheckEmitter.needsRuntimeChecks()) {
      RtCheckEmitter.versionLoop(this);
      RtCheckEmitter.addPHINodes(DefsUsedOutside);
    }

    // Create identical copies of the original loop for each partition and hook
    // them up sequentially.
    Partitions.cloneLoops(this);

    // Now, we remove the instruction from each loop that don't belong to that
    // partition.
945 Partitions.removeUnusedInsts(); 946 DEBUG(dbgs() << "\nAfter removing unused Instrs:\n"); 947 DEBUG(Partitions.printBlocks()); 948 949 if (LDistVerify) { 950 LI->verify(); 951 DT->verifyDomTree(); 952 } 953 954 ++NumLoopsDistributed; 955 return true; 956 } 957 958 // Analyses used. 959 LoopInfo *LI; 960 LoopAccessAnalysis *LAA; 961 DominatorTree *DT; 962 }; 963 } // anonymous namespace 964 965 char LoopDistribute::ID; 966 static const char ldist_name[] = "Loop Distribition"; 967 968 INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false) 969 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 970 INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) 971 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 972 INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false) 973 974 namespace llvm { 975 FunctionPass *createLoopDistributePass() { return new LoopDistribute(); } 976 } 977