1 //===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements the Loop Distribution Pass. Its main focus is to 11 // distribute loops that cannot be vectorized due to dependence cycles. It 12 // tries to isolate the offending dependences into a new loop allowing 13 // vectorization of the remaining parts. 14 // 15 // For dependence analysis, the pass uses the LoopVectorizer's 16 // LoopAccessAnalysis. Because this analysis presumes no change in the order of 17 // memory operations, special care is taken to preserve the lexical order of 18 // these operations. 19 // 20 // Similarly to the Vectorizer, the pass also supports loop versioning to 21 // run-time disambiguate potentially overlapping arrays. 22 // 23 //===----------------------------------------------------------------------===// 24 25 #include "llvm/ADT/DepthFirstIterator.h" 26 #include "llvm/ADT/EquivalenceClasses.h" 27 #include "llvm/ADT/STLExtras.h" 28 #include "llvm/ADT/Statistic.h" 29 #include "llvm/Analysis/LoopAccessAnalysis.h" 30 #include "llvm/Analysis/LoopInfo.h" 31 #include "llvm/IR/Dominators.h" 32 #include "llvm/Pass.h" 33 #include "llvm/Support/CommandLine.h" 34 #include "llvm/Support/Debug.h" 35 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 36 #include "llvm/Transforms/Utils/Cloning.h" 37 #include <list> 38 39 #define LDIST_NAME "loop-distribute" 40 #define DEBUG_TYPE LDIST_NAME 41 42 using namespace llvm; 43 44 static cl::opt<bool> 45 LDistVerify("loop-distribute-verify", cl::Hidden, 46 cl::desc("Turn on DominatorTree and LoopInfo verification " 47 "after Loop Distribution"), 48 cl::init(false)); 49 50 static cl::opt<bool> DistributeNonIfConvertible( 51 "loop-distribute-non-if-convertible", cl::Hidden, 52 cl::desc("Whether to distribute into a loop that may not be " 53 "if-convertible by the loop vectorizer"), 54 cl::init(false)); 55 56 STATISTIC(NumLoopsDistributed, "Number of loops distributed"); 57 58 namespace { 59 /// \brief Remaps instructions in a loop including the preheader. 60 void remapInstructionsInLoop(const SmallVectorImpl<BasicBlock *> &Blocks, 61 ValueToValueMapTy &VMap) { 62 // Rewrite the code to refer to itself. 63 for (auto *BB : Blocks) 64 for (auto &Inst : *BB) 65 RemapInstruction(&Inst, VMap, 66 RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); 67 } 68 69 /// \brief Clones a loop \p OrigLoop. Returns the loop and the blocks in \p 70 /// Blocks. 71 /// 72 /// Updates LoopInfo and DominatorTree assuming the loop is dominated by block 73 /// \p LoopDomBB. Insert the new blocks before block specified in \p Before. 74 static Loop *cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, 75 Loop *OrigLoop, ValueToValueMapTy &VMap, 76 const Twine &NameSuffix, LoopInfo *LI, 77 DominatorTree *DT, 78 SmallVectorImpl<BasicBlock *> &Blocks) { 79 Function *F = OrigLoop->getHeader()->getParent(); 80 Loop *ParentLoop = OrigLoop->getParentLoop(); 81 82 Loop *NewLoop = new Loop(); 83 if (ParentLoop) 84 ParentLoop->addChildLoop(NewLoop); 85 else 86 LI->addTopLevelLoop(NewLoop); 87 88 BasicBlock *OrigPH = OrigLoop->getLoopPreheader(); 89 BasicBlock *NewPH = CloneBasicBlock(OrigPH, VMap, NameSuffix, F); 90 // To rename the loop PHIs. 91 VMap[OrigPH] = NewPH; 92 Blocks.push_back(NewPH); 93 94 // Update LoopInfo. 95 if (ParentLoop) 96 ParentLoop->addBasicBlockToLoop(NewPH, *LI); 97 98 // Update DominatorTree. 99 DT->addNewBlock(NewPH, LoopDomBB); 100 101 for (BasicBlock *BB : OrigLoop->getBlocks()) { 102 BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F); 103 VMap[BB] = NewBB; 104 105 // Update LoopInfo. 106 NewLoop->addBasicBlockToLoop(NewBB, *LI); 107 108 // Update DominatorTree. 109 BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock(); 110 DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB])); 111 112 Blocks.push_back(NewBB); 113 } 114 115 // Move them physically from the end of the block list. 116 F->getBasicBlockList().splice(Before, F->getBasicBlockList(), NewPH); 117 F->getBasicBlockList().splice(Before, F->getBasicBlockList(), 118 NewLoop->getHeader(), F->end()); 119 120 return NewLoop; 121 } 122 123 /// \brief Maintains the set of instructions of the loop for a partition before 124 /// cloning. After cloning, it hosts the new loop. 125 class InstPartition { 126 typedef SmallPtrSet<Instruction *, 8> InstructionSet; 127 128 public: 129 InstPartition(Instruction *I, Loop *L, bool DepCycle = false) 130 : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) { 131 Set.insert(I); 132 } 133 134 /// \brief Returns whether this partition contains a dependence cycle. 135 bool hasDepCycle() const { return DepCycle; } 136 137 /// \brief Adds an instruction to this partition. 138 void add(Instruction *I) { Set.insert(I); } 139 140 /// \brief Collection accessors. 141 InstructionSet::iterator begin() { return Set.begin(); } 142 InstructionSet::iterator end() { return Set.end(); } 143 InstructionSet::const_iterator begin() const { return Set.begin(); } 144 InstructionSet::const_iterator end() const { return Set.end(); } 145 bool empty() const { return Set.empty(); } 146 147 /// \brief Moves this partition into \p Other. This partition becomes empty 148 /// after this. 149 void moveTo(InstPartition &Other) { 150 Other.Set.insert(Set.begin(), Set.end()); 151 Set.clear(); 152 Other.DepCycle |= DepCycle; 153 } 154 155 /// \brief Populates the partition with a transitive closure of all the 156 /// instructions that the seeded instructions dependent on. 157 void populateUsedSet() { 158 // FIXME: We currently don't use control-dependence but simply include all 159 // blocks (possibly empty at the end) and let simplifycfg mostly clean this 160 // up. 161 for (auto *B : OrigLoop->getBlocks()) 162 Set.insert(B->getTerminator()); 163 164 // Follow the use-def chains to form a transitive closure of all the 165 // instructions that the originally seeded instructions depend on. 166 SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end()); 167 while (!Worklist.empty()) { 168 Instruction *I = Worklist.pop_back_val(); 169 // Insert instructions from the loop that we depend on. 170 for (Value *V : I->operand_values()) { 171 auto *I = dyn_cast<Instruction>(V); 172 if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second) 173 Worklist.push_back(I); 174 } 175 } 176 } 177 178 /// \brief Clones the original loop. 179 /// 180 /// Updates LoopInfo and DominatorTree using the information that block \p 181 /// LoopDomBB dominates the loop. 182 Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB, 183 unsigned Index, LoopInfo *LI, 184 DominatorTree *DT) { 185 ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop, 186 VMap, Twine(".ldist") + Twine(Index), 187 LI, DT, ClonedLoopBlocks); 188 return ClonedLoop; 189 } 190 191 /// \brief The cloned loop. If this partition is mapped to the original loop, 192 /// this is null. 193 const Loop *getClonedLoop() const { return ClonedLoop; } 194 195 /// \brief Returns the loop where this partition ends up after distribution. 196 /// If this partition is mapped to the original loop then use the block from 197 /// the loop. 198 const Loop *getDistributedLoop() const { 199 return ClonedLoop ? ClonedLoop : OrigLoop; 200 } 201 202 /// \brief The VMap that is populated by cloning and then used in 203 /// remapinstruction to remap the cloned instructions. 204 ValueToValueMapTy &getVMap() { return VMap; } 205 206 /// \brief Remaps the cloned instructions using VMap. 207 void remapInstructions() { remapInstructionsInLoop(ClonedLoopBlocks, VMap); } 208 209 /// \brief Based on the set of instructions selected for this partition, 210 /// removes the unnecessary ones. 211 void removeUnusedInsts() { 212 SmallVector<Instruction *, 8> Unused; 213 214 for (auto *Block : OrigLoop->getBlocks()) 215 for (auto &Inst : *Block) 216 if (!Set.count(&Inst)) { 217 Instruction *NewInst = &Inst; 218 if (!VMap.empty()) 219 NewInst = cast<Instruction>(VMap[NewInst]); 220 221 assert(!isa<BranchInst>(NewInst) && 222 "Branches are marked used early on"); 223 Unused.push_back(NewInst); 224 } 225 226 // Delete the instructions backwards, as it has a reduced likelihood of 227 // having to update as many def-use and use-def chains. 228 for (auto I = Unused.rbegin(), E = Unused.rend(); I != E; ++I) { 229 auto *Inst = *I; 230 231 if (!Inst->use_empty()) 232 Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); 233 Inst->eraseFromParent(); 234 } 235 } 236 237 void print() { 238 if (DepCycle) 239 dbgs() << " (cycle)\n"; 240 for (auto *I : Set) 241 // Prefix with the block name. 242 dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n"; 243 } 244 245 void printBlocks() const { 246 for (auto *BB : getDistributedLoop()->getBlocks()) 247 dbgs() << *BB; 248 } 249 250 private: 251 /// \brief Instructions from OrigLoop selected for this partition. 252 InstructionSet Set; 253 254 /// \brief Whether this partition contains a dependence cycle. 255 bool DepCycle; 256 257 /// \brief The original loop. 258 Loop *OrigLoop; 259 260 /// \brief The cloned loop. If this partition is mapped to the original loop, 261 /// this is null. 262 Loop *ClonedLoop; 263 264 /// \brief The blocks of ClonedLoop including the preheader. If this 265 /// partition is mapped to the original loop, this is empty. 266 SmallVector<BasicBlock *, 8> ClonedLoopBlocks; 267 268 /// \brief These gets populated once the set of instructions have been 269 /// finalized. If this partition is mapped to the original loop, these are not 270 /// set. 271 ValueToValueMapTy VMap; 272 }; 273 274 /// \brief Holds the set of Partitions. It populates them, merges them and then 275 /// clones the loops. 276 class InstPartitionContainer { 277 typedef DenseMap<Instruction *, int> InstToPartitionIdT; 278 279 public: 280 InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT) 281 : L(L), LI(LI), DT(DT) {} 282 283 /// \brief Returns the number of partitions. 284 unsigned getSize() const { return PartitionContainer.size(); } 285 286 /// \brief Adds \p Inst into the current partition if that is marked to 287 /// contain cycles. Otherwise start a new partition for it. 288 void addToCyclicPartition(Instruction *Inst) { 289 // If the current partition is non-cyclic. Start a new one. 290 if (PartitionContainer.empty() || !PartitionContainer.back()->hasDepCycle()) 291 PartitionContainer.push_back( 292 llvm::make_unique<InstPartition>(Inst, L, true)); 293 else 294 PartitionContainer.back()->add(Inst); 295 } 296 297 /// \brief Adds \p Inst into a partition that is not marked to contain 298 /// dependence cycles. 299 /// 300 // Initially we isolate memory instructions into as many partitions as 301 // possible, then later we may merge them back together. 302 void addToNewNonCyclicPartition(Instruction *Inst) { 303 PartitionContainer.push_back(llvm::make_unique<InstPartition>(Inst, L)); 304 } 305 306 /// \brief Merges adjacent non-cyclic partitions. 307 /// 308 /// The idea is that we currently only want to isolate the non-vectorizable 309 /// partition. We could later allow more distribution among these partition 310 /// too. 311 void mergeAdjacentNonCyclic() { 312 mergeAdjacentPartitionsIf( 313 [](const InstPartition *P) { return !P->hasDepCycle(); }); 314 } 315 316 /// \brief If a partition contains only conditional stores, we won't vectorize 317 /// it. Try to merge it with a previous cyclic partition. 318 void mergeNonIfConvertible() { 319 mergeAdjacentPartitionsIf([&](const InstPartition *Partition) { 320 if (Partition->hasDepCycle()) 321 return true; 322 323 // Now, check if all stores are conditional in this partition. 324 bool seenStore = false; 325 326 for (auto *Inst : *Partition) 327 if (isa<StoreInst>(Inst)) { 328 seenStore = true; 329 if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT)) 330 return false; 331 } 332 return seenStore; 333 }); 334 } 335 336 /// \brief Merges the partitions according to various heuristics. 337 void mergeBeforePopulating() { 338 mergeAdjacentNonCyclic(); 339 if (!DistributeNonIfConvertible) 340 mergeNonIfConvertible(); 341 } 342 343 /// \brief Merges partitions in order to ensure that no loads are duplicated. 344 /// 345 /// We can't duplicate loads because that could potentially reorder them. 346 /// LoopAccessAnalysis provides dependency information with the context that 347 /// the order of memory operation is preserved. 348 /// 349 /// Return if any partitions were merged. 350 bool mergeToAvoidDuplicatedLoads() { 351 typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT; 352 typedef EquivalenceClasses<InstPartition *> ToBeMergedT; 353 354 LoadToPartitionT LoadToPartition; 355 ToBeMergedT ToBeMerged; 356 357 // Step through the partitions and create equivalence between partitions 358 // that contain the same load. Also put partitions in between them in the 359 // same equivalence class to avoid reordering of memory operations. 360 for (PartitionContainerT::iterator I = PartitionContainer.begin(), 361 E = PartitionContainer.end(); 362 I != E; ++I) { 363 auto *PartI = I->get(); 364 365 // If a load occurs in two partitions PartI and PartJ, merge all 366 // partitions (PartI, PartJ] into PartI. 367 for (Instruction *Inst : *PartI) 368 if (isa<LoadInst>(Inst)) { 369 bool NewElt; 370 LoadToPartitionT::iterator LoadToPart; 371 372 std::tie(LoadToPart, NewElt) = 373 LoadToPartition.insert(std::make_pair(Inst, PartI)); 374 if (!NewElt) { 375 DEBUG(dbgs() << "Merging partitions due to this load in multiple " 376 << "partitions: " << PartI << ", " 377 << LoadToPart->second << "\n" << *Inst << "\n"); 378 379 auto PartJ = I; 380 do { 381 --PartJ; 382 ToBeMerged.unionSets(PartI, PartJ->get()); 383 } while (PartJ->get() != LoadToPart->second); 384 } 385 } 386 } 387 if (ToBeMerged.empty()) 388 return false; 389 390 // Merge the member of an equivalence class into its class leader. This 391 // makes the members empty. 392 for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end(); 393 I != E; ++I) { 394 if (!I->isLeader()) 395 continue; 396 397 auto PartI = I->getData(); 398 for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)), 399 ToBeMerged.member_end())) { 400 PartJ->moveTo(*PartI); 401 } 402 } 403 404 // Remove the empty partitions. 405 for (PartitionContainerT::iterator PartI = PartitionContainer.begin(), 406 E = PartitionContainer.end(); 407 PartI != E;) 408 if ((*PartI)->empty()) 409 PartI = PartitionContainer.erase(PartI); 410 else 411 ++PartI; 412 413 return true; 414 } 415 416 /// \brief Sets up the mapping between instructions to partitions. If the 417 /// instruction is duplicated across multiple partitions, set the entry to -1. 418 void setupPartitionIdOnInstructions() { 419 int PartitionID = 0; 420 for (auto &PartitionPtr : PartitionContainer) { 421 for (Instruction *Inst : *PartitionPtr) { 422 bool NewElt; 423 InstToPartitionIdT::iterator Iter; 424 425 std::tie(Iter, NewElt) = 426 InstToPartitionId.insert(std::make_pair(Inst, PartitionID)); 427 if (!NewElt) 428 Iter->second = -1; 429 } 430 ++PartitionID; 431 } 432 } 433 434 /// \brief Populates the partition with everything that the seeding 435 /// instructions require. 436 void populateUsedSet() { 437 for (auto &P : PartitionContainer) 438 P->populateUsedSet(); 439 } 440 441 /// \brief This performs the main chunk of the work of cloning the loops for 442 /// the partitions. 443 void cloneLoops(Pass *P) { 444 BasicBlock *OrigPH = L->getLoopPreheader(); 445 // At this point the predecessor of the preheader is either the memcheck 446 // block or the top part of the original preheader. 447 BasicBlock *Pred = OrigPH->getSinglePredecessor(); 448 assert(Pred && "Preheader does not have a single predecessor"); 449 BasicBlock *ExitBlock = L->getExitBlock(); 450 assert(ExitBlock && "No single exit block"); 451 Loop *NewLoop; 452 453 assert(!PartitionContainer.empty() && "at least two partitions expected"); 454 // We're cloning the preheader along with the loop so we already made sure 455 // it was empty. 456 assert(&*OrigPH->begin() == OrigPH->getTerminator() && 457 "preheader not empty"); 458 459 // Create a loop for each partition except the last. Clone the original 460 // loop before PH along with adding a preheader for the cloned loop. Then 461 // update PH to point to the newly added preheader. 462 BasicBlock *TopPH = OrigPH; 463 unsigned Index = getSize() - 1; 464 for (auto I = std::next(PartitionContainer.crbegin()), 465 E = PartitionContainer.crend(); 466 I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) { 467 auto &Part = *I; 468 469 NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT); 470 471 Part->getVMap()[ExitBlock] = TopPH; 472 Part->remapInstructions(); 473 } 474 Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH); 475 476 // Now go in forward order and update the immediate dominator for the 477 // preheaders with the exiting block of the previous loop. Dominance 478 // within the loop is updated in cloneLoopWithPreheader. 479 for (auto Curr = PartitionContainer.cbegin(), 480 Next = std::next(PartitionContainer.cbegin()), 481 E = PartitionContainer.cend(); 482 Next != E; ++Curr, ++Next) 483 DT->changeImmediateDominator( 484 (*Next)->getDistributedLoop()->getLoopPreheader(), 485 (*Curr)->getDistributedLoop()->getExitingBlock()); 486 } 487 488 /// \brief Removes the dead instructions from the cloned loops. 489 void removeUnusedInsts() { 490 for (auto &PartitionPtr : PartitionContainer) 491 PartitionPtr->removeUnusedInsts(); 492 } 493 494 /// \brief For each memory pointer, it computes the partitionId the pointer is 495 /// used in. 496 /// 497 /// This returns an array of int where the I-th entry corresponds to I-th 498 /// entry in LAI.getRuntimePointerCheck(). If the pointer is used in multiple 499 /// partitions its entry is set to -1. 500 SmallVector<int, 8> 501 computePartitionSetForPointers(const LoopAccessInfo &LAI) { 502 const LoopAccessInfo::RuntimePointerCheck *RtPtrCheck = 503 LAI.getRuntimePointerCheck(); 504 505 unsigned N = RtPtrCheck->Pointers.size(); 506 SmallVector<int, 8> PtrToPartitions(N); 507 for (unsigned I = 0; I < N; ++I) { 508 Value *Ptr = RtPtrCheck->Pointers[I]; 509 auto Instructions = 510 LAI.getInstructionsForAccess(Ptr, RtPtrCheck->IsWritePtr[I]); 511 512 int &Partition = PtrToPartitions[I]; 513 // First set it to uninitialized. 514 Partition = -2; 515 for (Instruction *Inst : Instructions) { 516 // Note that this could be -1 if Inst is duplicated across multiple 517 // partitions. 518 int ThisPartition = this->InstToPartitionId[Inst]; 519 if (Partition == -2) 520 Partition = ThisPartition; 521 // -1 means belonging to multiple partitions. 522 else if (Partition == -1) 523 break; 524 else if (Partition != (int)ThisPartition) 525 Partition = -1; 526 } 527 assert(Partition != -2 && "Pointer not belonging to any partition"); 528 } 529 530 return PtrToPartitions; 531 } 532 533 void print(raw_ostream &OS) const { 534 unsigned Index = 0; 535 for (auto &P : PartitionContainer) { 536 OS << "Partition " << Index++ << " (" << P.get() << "):\n"; 537 P->print(); 538 } 539 } 540 541 void dump() const { print(dbgs()); } 542 543 #ifndef NDEBUG 544 friend raw_ostream &operator<<(raw_ostream &OS, 545 const InstPartitionContainer &Partitions) { 546 Partitions.print(OS); 547 return OS; 548 } 549 #endif 550 551 void printBlocks() const { 552 unsigned Index = 0; 553 for (auto &P : PartitionContainer) { 554 dbgs() << "\nPartition " << Index++ << " (" << P.get() << "):\n"; 555 P->printBlocks(); 556 } 557 } 558 559 private: 560 typedef std::list<std::unique_ptr<InstPartition>> PartitionContainerT; 561 562 /// \brief List of partitions. 563 PartitionContainerT PartitionContainer; 564 565 /// \brief Mapping from Instruction to partition Id. If the instruction 566 /// belongs to multiple partitions the entry contains -1. 567 InstToPartitionIdT InstToPartitionId; 568 569 Loop *L; 570 LoopInfo *LI; 571 DominatorTree *DT; 572 573 /// \brief The control structure to merge adjacent partitions if both satisfy 574 /// the \p Predicate. 575 template <class UnaryPredicate> 576 void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) { 577 InstPartition *PrevMatch = nullptr; 578 for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) { 579 auto DoesMatch = Predicate(I->get()); 580 if (PrevMatch == nullptr && DoesMatch) { 581 PrevMatch = I->get(); 582 ++I; 583 } else if (PrevMatch != nullptr && DoesMatch) { 584 (*I)->moveTo(*PrevMatch); 585 I = PartitionContainer.erase(I); 586 } else { 587 PrevMatch = nullptr; 588 ++I; 589 } 590 } 591 } 592 }; 593 594 /// \brief For each memory instruction, this class maintains difference of the 595 /// number of unsafe dependences that start out from this instruction minus 596 /// those that end here. 597 /// 598 /// By traversing the memory instructions in program order and accumulating this 599 /// number, we know whether any unsafe dependence crosses over a program point. 600 class MemoryInstructionDependences { 601 typedef MemoryDepChecker::Dependence Dependence; 602 603 public: 604 struct Entry { 605 Instruction *Inst; 606 unsigned NumUnsafeDependencesStartOrEnd; 607 608 Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {} 609 }; 610 611 typedef SmallVector<Entry, 8> AccessesType; 612 613 AccessesType::const_iterator begin() const { return Accesses.begin(); } 614 AccessesType::const_iterator end() const { return Accesses.end(); } 615 616 MemoryInstructionDependences( 617 const SmallVectorImpl<Instruction *> &Instructions, 618 const SmallVectorImpl<Dependence> &InterestingDependences) { 619 std::transform(Instructions.begin(), Instructions.end(), 620 std::back_inserter(Accesses), 621 [](Instruction *Inst) { return Entry(Inst); }); 622 623 DEBUG(dbgs() << "Backward dependences:\n"); 624 for (auto &Dep : InterestingDependences) 625 if (Dep.isPossiblyBackward()) { 626 // Note that the designations source and destination follow the program 627 // order, i.e. source is always first. (The direction is given by the 628 // DepType.) 629 ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd; 630 --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd; 631 632 DEBUG(Dep.print(dbgs(), 2, Instructions)); 633 } 634 } 635 636 private: 637 AccessesType Accesses; 638 }; 639 640 /// \brief Handles the loop versioning based on memchecks. 641 class RuntimeCheckEmitter { 642 public: 643 RuntimeCheckEmitter(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI, 644 DominatorTree *DT) 645 : OrigLoop(L), NonDistributedLoop(nullptr), LAI(LAI), LI(LI), DT(DT) {} 646 647 /// \brief Given the \p Partitions formed by Loop Distribution, it determines 648 /// in which partition each pointer is used. 649 void partitionPointers(InstPartitionContainer &Partitions) { 650 // Set up partition id in PtrRtChecks. Ptr -> Access -> Intruction -> 651 // Partition. 652 PtrToPartition = Partitions.computePartitionSetForPointers(LAI); 653 654 DEBUG(dbgs() << "\nPointers:\n"); 655 DEBUG(LAI.getRuntimePointerCheck()->print(dbgs(), 0, &PtrToPartition)); 656 } 657 658 /// \brief Returns true if we need memchecks to distribute the loop. 659 bool needsRuntimeChecks() const { 660 return LAI.getRuntimePointerCheck()->needsAnyChecking(&PtrToPartition); 661 } 662 663 /// \brief Performs the CFG manipulation part of versioning the loop including 664 /// the DominatorTree and LoopInfo updates. 665 void versionLoop(Pass *P) { 666 Instruction *FirstCheckInst; 667 Instruction *MemRuntimeCheck; 668 // Add the memcheck in the original preheader (this is empty initially). 669 BasicBlock *MemCheckBB = OrigLoop->getLoopPreheader(); 670 std::tie(FirstCheckInst, MemRuntimeCheck) = 671 LAI.addRuntimeCheck(MemCheckBB->getTerminator(), &PtrToPartition); 672 assert(MemRuntimeCheck && "called even though needsAnyChecking = false"); 673 674 // Rename the block to make the IR more readable. 675 MemCheckBB->setName(OrigLoop->getHeader()->getName() + ".ldist.memcheck"); 676 677 // Create empty preheader for the loop (and after cloning for the 678 // original/nondist loop). 679 BasicBlock *PH = 680 SplitBlock(MemCheckBB, MemCheckBB->getTerminator(), DT, LI); 681 PH->setName(OrigLoop->getHeader()->getName() + ".ph"); 682 683 // Clone the loop including the preheader. 684 // 685 // FIXME: This does not currently preserve SimplifyLoop because the exit 686 // block is a join between the two loops. 687 SmallVector<BasicBlock *, 8> NonDistributedLoopBlocks; 688 NonDistributedLoop = 689 cloneLoopWithPreheader(PH, MemCheckBB, OrigLoop, VMap, ".ldist.nondist", 690 LI, DT, NonDistributedLoopBlocks); 691 remapInstructionsInLoop(NonDistributedLoopBlocks, VMap); 692 693 // Insert the conditional branch based on the result of the memchecks. 694 Instruction *OrigTerm = MemCheckBB->getTerminator(); 695 BranchInst::Create(NonDistributedLoop->getLoopPreheader(), 696 OrigLoop->getLoopPreheader(), MemRuntimeCheck, OrigTerm); 697 OrigTerm->eraseFromParent(); 698 699 // The loops merge in the original exit block. This is now dominated by the 700 // memchecking block. 701 DT->changeImmediateDominator(OrigLoop->getExitBlock(), MemCheckBB); 702 } 703 704 /// \brief Adds the necessary PHI nodes for the versioned loops based on the 705 /// loop-defined values used outside of the loop. 706 void addPHINodes(const SmallVectorImpl<Instruction *> &DefsUsedOutside) { 707 BasicBlock *PHIBlock = OrigLoop->getExitBlock(); 708 assert(PHIBlock && "No single successor to loop exit block"); 709 710 for (auto *Inst : DefsUsedOutside) { 711 auto *NonDistInst = cast<Instruction>(VMap[Inst]); 712 PHINode *PN; 713 BasicBlock::iterator I; 714 715 // First see if we have a single-operand PHI with the value defined by the 716 // original loop. 717 for (I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) { 718 assert(PN->getNumOperands() == 1 && 719 "Exit block should only have on predecessor"); 720 if (PN->getIncomingValue(0) == Inst) 721 break; 722 } 723 // If not create it. 724 if (!PN) { 725 PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".ldist", 726 PHIBlock->begin()); 727 for (auto *User : Inst->users()) 728 if (!OrigLoop->contains(cast<Instruction>(User)->getParent())) 729 User->replaceUsesOfWith(Inst, PN); 730 PN->addIncoming(Inst, OrigLoop->getExitingBlock()); 731 } 732 // Add the new incoming value from the non-distributed loop. 733 PN->addIncoming(NonDistInst, NonDistributedLoop->getExitingBlock()); 734 } 735 } 736 737 private: 738 /// \brief The original loop. This becomes the "versioned" one, i.e. control 739 /// goes if the memchecks all pass. 740 Loop *OrigLoop; 741 /// \brief The fall-back loop, i.e. if any of the memchecks fail. 742 Loop *NonDistributedLoop; 743 744 /// \brief For each memory pointer it contains the partitionId it is used in. 745 /// 746 /// The I-th entry corresponds to I-th entry in LAI.getRuntimePointerCheck(). 747 /// If the pointer is used in multiple partitions the entry is set to -1. 748 SmallVector<int, 8> PtrToPartition; 749 750 /// \brief This maps the instructions from OrigLoop to their counterpart in 751 /// NonDistributedLoop. 752 ValueToValueMapTy VMap; 753 754 /// \brief Analyses used. 755 const LoopAccessInfo &LAI; 756 LoopInfo *LI; 757 DominatorTree *DT; 758 }; 759 760 /// \brief Returns the instructions that use values defined in the loop. 761 static SmallVector<Instruction *, 8> findDefsUsedOutsideOfLoop(Loop *L) { 762 SmallVector<Instruction *, 8> UsedOutside; 763 764 for (auto *Block : L->getBlocks()) 765 // FIXME: I believe that this could use copy_if if the Inst reference could 766 // be adapted into a pointer. 767 for (auto &Inst : *Block) { 768 auto Users = Inst.users(); 769 if (std::any_of(Users.begin(), Users.end(), [&](User *U) { 770 auto *Use = cast<Instruction>(U); 771 return !L->contains(Use->getParent()); 772 })) 773 UsedOutside.push_back(&Inst); 774 } 775 776 return UsedOutside; 777 } 778 779 /// \brief The pass class. 780 class LoopDistribute : public FunctionPass { 781 public: 782 LoopDistribute() : FunctionPass(ID) { 783 initializeLoopDistributePass(*PassRegistry::getPassRegistry()); 784 } 785 786 bool runOnFunction(Function &F) override { 787 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 788 LAA = &getAnalysis<LoopAccessAnalysis>(); 789 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 790 791 // Build up a worklist of inner-loops to vectorize. This is necessary as the 792 // act of distributing a loop creates new loops and can invalidate iterators 793 // across the loops. 794 SmallVector<Loop *, 8> Worklist; 795 796 for (Loop *TopLevelLoop : *LI) 797 for (Loop *L : depth_first(TopLevelLoop)) 798 // We only handle inner-most loops. 799 if (L->empty()) 800 Worklist.push_back(L); 801 802 // Now walk the identified inner loops. 803 bool Changed = false; 804 for (Loop *L : Worklist) 805 Changed |= processLoop(L); 806 807 // Process each loop nest in the function. 808 return Changed; 809 } 810 811 void getAnalysisUsage(AnalysisUsage &AU) const override { 812 AU.addRequired<LoopInfoWrapperPass>(); 813 AU.addPreserved<LoopInfoWrapperPass>(); 814 AU.addRequired<LoopAccessAnalysis>(); 815 AU.addRequired<DominatorTreeWrapperPass>(); 816 AU.addPreserved<DominatorTreeWrapperPass>(); 817 } 818 819 static char ID; 820 821 private: 822 /// \brief Try to distribute an inner-most loop. 823 bool processLoop(Loop *L) { 824 assert(L->empty() && "Only process inner loops."); 825 826 DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName() 827 << "\" checking " << *L << "\n"); 828 829 BasicBlock *PH = L->getLoopPreheader(); 830 if (!PH) { 831 DEBUG(dbgs() << "Skipping; no preheader"); 832 return false; 833 } 834 if (!L->getExitBlock()) { 835 DEBUG(dbgs() << "Skipping; multiple exit blocks"); 836 return false; 837 } 838 // LAA will check that we only have a single exiting block. 839 840 const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap()); 841 842 // Currently, we only distribute to isolate the part of the loop with 843 // dependence cycles to enable partial vectorization. 844 if (LAI.canVectorizeMemory()) { 845 DEBUG(dbgs() << "Skipping; memory operations are safe for vectorization"); 846 return false; 847 } 848 auto *InterestingDependences = 849 LAI.getDepChecker().getInterestingDependences(); 850 if (!InterestingDependences || InterestingDependences->empty()) { 851 DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate"); 852 return false; 853 } 854 855 InstPartitionContainer Partitions(L, LI, DT); 856 857 // First, go through each memory operation and assign them to consecutive 858 // partitions (the order of partitions follows program order). Put those 859 // with unsafe dependences into "cyclic" partition otherwise put each store 860 // in its own "non-cyclic" partition (we'll merge these later). 861 // 862 // Note that a memory operation (e.g. Load2 below) at a program point that 863 // has an unsafe dependence (Store3->Load1) spanning over it must be 864 // included in the same cyclic partition as the dependent operations. This 865 // is to preserve the original program order after distribution. E.g.: 866 // 867 // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive 868 // Load1 -. 1 0->1 869 // Load2 | /Unsafe/ 0 1 870 // Store3 -' -1 1->0 871 // Load4 0 0 872 // 873 // NumUnsafeDependencesActive > 0 indicates this situation and in this case 874 // we just keep assigning to the same cyclic partition until 875 // NumUnsafeDependencesActive reaches 0. 876 const MemoryDepChecker &DepChecker = LAI.getDepChecker(); 877 MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(), 878 *InterestingDependences); 879 880 int NumUnsafeDependencesActive = 0; 881 for (auto &InstDep : MID) { 882 Instruction *I = InstDep.Inst; 883 // We update NumUnsafeDependencesActive post-instruction, catch the 884 // start of a dependence directly via NumUnsafeDependencesStartOrEnd. 885 if (NumUnsafeDependencesActive || 886 InstDep.NumUnsafeDependencesStartOrEnd > 0) 887 Partitions.addToCyclicPartition(I); 888 else 889 Partitions.addToNewNonCyclicPartition(I); 890 NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd; 891 assert(NumUnsafeDependencesActive >= 0 && 892 "Negative number of dependences active"); 893 } 894 895 // Add partitions for values used outside. These partitions can be out of 896 // order from the original program order. This is OK because if the 897 // partition uses a load we will merge this partition with the original 898 // partition of the load that we set up in the previous loop (see 899 // mergeToAvoidDuplicatedLoads). 900 auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L); 901 for (auto *Inst : DefsUsedOutside) 902 Partitions.addToNewNonCyclicPartition(Inst); 903 904 DEBUG(dbgs() << "Seeded partitions:\n" << Partitions); 905 if (Partitions.getSize() < 2) 906 return false; 907 908 // Run the merge heuristics: Merge non-cyclic adjacent partitions since we 909 // should be able to vectorize these together. 910 Partitions.mergeBeforePopulating(); 911 DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions); 912 if (Partitions.getSize() < 2) 913 return false; 914 915 // Now, populate the partitions with non-memory operations. 916 Partitions.populateUsedSet(); 917 DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions); 918 919 // In order to preserve original lexical order for loads, keep them in the 920 // partition that we set up in the MemoryInstructionDependences loop. 921 if (Partitions.mergeToAvoidDuplicatedLoads()) { 922 DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n" 923 << Partitions); 924 if (Partitions.getSize() < 2) 925 return false; 926 } 927 928 DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); 929 // We're done forming the partitions set up the reverse mapping from 930 // instructions to partitions. 931 Partitions.setupPartitionIdOnInstructions(); 932 933 // To keep things simple have an empty preheader before we version or clone 934 // the loop. (Also split if this has no predecessor, i.e. entry, because we 935 // rely on PH having a predecessor.) 936 if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator()) 937 SplitBlock(PH, PH->getTerminator(), DT, LI); 938 939 // If we need run-time checks to disambiguate pointers are run-time, version 940 // the loop now. 941 RuntimeCheckEmitter RtCheckEmitter(LAI, L, LI, DT); 942 RtCheckEmitter.partitionPointers(Partitions); 943 if (RtCheckEmitter.needsRuntimeChecks()) { 944 RtCheckEmitter.versionLoop(this); 945 RtCheckEmitter.addPHINodes(DefsUsedOutside); 946 } 947 948 // Create identical copies of the original loop for each partition and hook 949 // them up sequentially. 950 Partitions.cloneLoops(this); 951 952 // Now, we remove the instruction from each loop that don't belong to that 953 // partition. 954 Partitions.removeUnusedInsts(); 955 DEBUG(dbgs() << "\nAfter removing unused Instrs:\n"); 956 DEBUG(Partitions.printBlocks()); 957 958 if (LDistVerify) { 959 LI->verify(); 960 DT->verifyDomTree(); 961 } 962 963 ++NumLoopsDistributed; 964 return true; 965 } 966 967 // Analyses used. 968 LoopInfo *LI; 969 LoopAccessAnalysis *LAA; 970 DominatorTree *DT; 971 }; 972 } // anonymous namespace 973 974 char LoopDistribute::ID; 975 static const char ldist_name[] = "Loop Distribition"; 976 977 INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false) 978 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 979 INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) 980 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 981 INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false) 982 983 namespace llvm { 984 FunctionPass *createLoopDistributePass() { return new LoopDistribute(); } 985 } 986