1 //===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements the Loop Distribution Pass. Its main focus is to 11 // distribute loops that cannot be vectorized due to dependence cycles. It 12 // tries to isolate the offending dependences into a new loop allowing 13 // vectorization of the remaining parts. 14 // 15 // For dependence analysis, the pass uses the LoopVectorizer's 16 // LoopAccessAnalysis. Because this analysis presumes no change in the order of 17 // memory operations, special care is taken to preserve the lexical order of 18 // these operations. 19 // 20 // Similarly to the Vectorizer, the pass also supports loop versioning to 21 // run-time disambiguate potentially overlapping arrays. 22 // 23 //===----------------------------------------------------------------------===// 24 25 #include "llvm/ADT/DepthFirstIterator.h" 26 #include "llvm/ADT/EquivalenceClasses.h" 27 #include "llvm/ADT/STLExtras.h" 28 #include "llvm/ADT/Statistic.h" 29 #include "llvm/Analysis/LoopAccessAnalysis.h" 30 #include "llvm/Analysis/LoopInfo.h" 31 #include "llvm/IR/Dominators.h" 32 #include "llvm/Pass.h" 33 #include "llvm/Support/CommandLine.h" 34 #include "llvm/Support/Debug.h" 35 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 36 #include "llvm/Transforms/Utils/Cloning.h" 37 #include "llvm/Transforms/Utils/LoopUtils.h" 38 #include "llvm/Transforms/Utils/LoopVersioning.h" 39 #include <list> 40 41 #define LDIST_NAME "loop-distribute" 42 #define DEBUG_TYPE LDIST_NAME 43 44 using namespace llvm; 45 46 static cl::opt<bool> 47 LDistVerify("loop-distribute-verify", cl::Hidden, 48 cl::desc("Turn on DominatorTree and LoopInfo verification " 49 "after Loop Distribution"), 50 cl::init(false)); 51 52 static cl::opt<bool> DistributeNonIfConvertible( 53 "loop-distribute-non-if-convertible", cl::Hidden, 54 cl::desc("Whether to distribute into a loop that may not be " 55 "if-convertible by the loop vectorizer"), 56 cl::init(false)); 57 58 STATISTIC(NumLoopsDistributed, "Number of loops distributed"); 59 60 namespace { 61 /// \brief Maintains the set of instructions of the loop for a partition before 62 /// cloning. After cloning, it hosts the new loop. 63 class InstPartition { 64 typedef SmallPtrSet<Instruction *, 8> InstructionSet; 65 66 public: 67 InstPartition(Instruction *I, Loop *L, bool DepCycle = false) 68 : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) { 69 Set.insert(I); 70 } 71 72 /// \brief Returns whether this partition contains a dependence cycle. 73 bool hasDepCycle() const { return DepCycle; } 74 75 /// \brief Adds an instruction to this partition. 76 void add(Instruction *I) { Set.insert(I); } 77 78 /// \brief Collection accessors. 79 InstructionSet::iterator begin() { return Set.begin(); } 80 InstructionSet::iterator end() { return Set.end(); } 81 InstructionSet::const_iterator begin() const { return Set.begin(); } 82 InstructionSet::const_iterator end() const { return Set.end(); } 83 bool empty() const { return Set.empty(); } 84 85 /// \brief Moves this partition into \p Other. This partition becomes empty 86 /// after this. 87 void moveTo(InstPartition &Other) { 88 Other.Set.insert(Set.begin(), Set.end()); 89 Set.clear(); 90 Other.DepCycle |= DepCycle; 91 } 92 93 /// \brief Populates the partition with a transitive closure of all the 94 /// instructions that the seeded instructions dependent on. 95 void populateUsedSet() { 96 // FIXME: We currently don't use control-dependence but simply include all 97 // blocks (possibly empty at the end) and let simplifycfg mostly clean this 98 // up. 99 for (auto *B : OrigLoop->getBlocks()) 100 Set.insert(B->getTerminator()); 101 102 // Follow the use-def chains to form a transitive closure of all the 103 // instructions that the originally seeded instructions depend on. 104 SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end()); 105 while (!Worklist.empty()) { 106 Instruction *I = Worklist.pop_back_val(); 107 // Insert instructions from the loop that we depend on. 108 for (Value *V : I->operand_values()) { 109 auto *I = dyn_cast<Instruction>(V); 110 if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second) 111 Worklist.push_back(I); 112 } 113 } 114 } 115 116 /// \brief Clones the original loop. 117 /// 118 /// Updates LoopInfo and DominatorTree using the information that block \p 119 /// LoopDomBB dominates the loop. 120 Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB, 121 unsigned Index, LoopInfo *LI, 122 DominatorTree *DT) { 123 ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop, 124 VMap, Twine(".ldist") + Twine(Index), 125 LI, DT, ClonedLoopBlocks); 126 return ClonedLoop; 127 } 128 129 /// \brief The cloned loop. If this partition is mapped to the original loop, 130 /// this is null. 131 const Loop *getClonedLoop() const { return ClonedLoop; } 132 133 /// \brief Returns the loop where this partition ends up after distribution. 134 /// If this partition is mapped to the original loop then use the block from 135 /// the loop. 136 const Loop *getDistributedLoop() const { 137 return ClonedLoop ? ClonedLoop : OrigLoop; 138 } 139 140 /// \brief The VMap that is populated by cloning and then used in 141 /// remapinstruction to remap the cloned instructions. 142 ValueToValueMapTy &getVMap() { return VMap; } 143 144 /// \brief Remaps the cloned instructions using VMap. 145 void remapInstructions() { 146 remapInstructionsInBlocks(ClonedLoopBlocks, VMap); 147 } 148 149 /// \brief Based on the set of instructions selected for this partition, 150 /// removes the unnecessary ones. 151 void removeUnusedInsts() { 152 SmallVector<Instruction *, 8> Unused; 153 154 for (auto *Block : OrigLoop->getBlocks()) 155 for (auto &Inst : *Block) 156 if (!Set.count(&Inst)) { 157 Instruction *NewInst = &Inst; 158 if (!VMap.empty()) 159 NewInst = cast<Instruction>(VMap[NewInst]); 160 161 assert(!isa<BranchInst>(NewInst) && 162 "Branches are marked used early on"); 163 Unused.push_back(NewInst); 164 } 165 166 // Delete the instructions backwards, as it has a reduced likelihood of 167 // having to update as many def-use and use-def chains. 168 for (auto *Inst : make_range(Unused.rbegin(), Unused.rend())) { 169 if (!Inst->use_empty()) 170 Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); 171 Inst->eraseFromParent(); 172 } 173 } 174 175 void print() const { 176 if (DepCycle) 177 dbgs() << " (cycle)\n"; 178 for (auto *I : Set) 179 // Prefix with the block name. 180 dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n"; 181 } 182 183 void printBlocks() const { 184 for (auto *BB : getDistributedLoop()->getBlocks()) 185 dbgs() << *BB; 186 } 187 188 private: 189 /// \brief Instructions from OrigLoop selected for this partition. 190 InstructionSet Set; 191 192 /// \brief Whether this partition contains a dependence cycle. 193 bool DepCycle; 194 195 /// \brief The original loop. 196 Loop *OrigLoop; 197 198 /// \brief The cloned loop. If this partition is mapped to the original loop, 199 /// this is null. 200 Loop *ClonedLoop; 201 202 /// \brief The blocks of ClonedLoop including the preheader. If this 203 /// partition is mapped to the original loop, this is empty. 204 SmallVector<BasicBlock *, 8> ClonedLoopBlocks; 205 206 /// \brief These gets populated once the set of instructions have been 207 /// finalized. If this partition is mapped to the original loop, these are not 208 /// set. 209 ValueToValueMapTy VMap; 210 }; 211 212 /// \brief Holds the set of Partitions. It populates them, merges them and then 213 /// clones the loops. 214 class InstPartitionContainer { 215 typedef DenseMap<Instruction *, int> InstToPartitionIdT; 216 217 public: 218 InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT) 219 : L(L), LI(LI), DT(DT) {} 220 221 /// \brief Returns the number of partitions. 222 unsigned getSize() const { return PartitionContainer.size(); } 223 224 /// \brief Adds \p Inst into the current partition if that is marked to 225 /// contain cycles. Otherwise start a new partition for it. 226 void addToCyclicPartition(Instruction *Inst) { 227 // If the current partition is non-cyclic. Start a new one. 228 if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle()) 229 PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true); 230 else 231 PartitionContainer.back().add(Inst); 232 } 233 234 /// \brief Adds \p Inst into a partition that is not marked to contain 235 /// dependence cycles. 236 /// 237 // Initially we isolate memory instructions into as many partitions as 238 // possible, then later we may merge them back together. 239 void addToNewNonCyclicPartition(Instruction *Inst) { 240 PartitionContainer.emplace_back(Inst, L); 241 } 242 243 /// \brief Merges adjacent non-cyclic partitions. 244 /// 245 /// The idea is that we currently only want to isolate the non-vectorizable 246 /// partition. We could later allow more distribution among these partition 247 /// too. 248 void mergeAdjacentNonCyclic() { 249 mergeAdjacentPartitionsIf( 250 [](const InstPartition *P) { return !P->hasDepCycle(); }); 251 } 252 253 /// \brief If a partition contains only conditional stores, we won't vectorize 254 /// it. Try to merge it with a previous cyclic partition. 255 void mergeNonIfConvertible() { 256 mergeAdjacentPartitionsIf([&](const InstPartition *Partition) { 257 if (Partition->hasDepCycle()) 258 return true; 259 260 // Now, check if all stores are conditional in this partition. 261 bool seenStore = false; 262 263 for (auto *Inst : *Partition) 264 if (isa<StoreInst>(Inst)) { 265 seenStore = true; 266 if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT)) 267 return false; 268 } 269 return seenStore; 270 }); 271 } 272 273 /// \brief Merges the partitions according to various heuristics. 274 void mergeBeforePopulating() { 275 mergeAdjacentNonCyclic(); 276 if (!DistributeNonIfConvertible) 277 mergeNonIfConvertible(); 278 } 279 280 /// \brief Merges partitions in order to ensure that no loads are duplicated. 281 /// 282 /// We can't duplicate loads because that could potentially reorder them. 283 /// LoopAccessAnalysis provides dependency information with the context that 284 /// the order of memory operation is preserved. 285 /// 286 /// Return if any partitions were merged. 287 bool mergeToAvoidDuplicatedLoads() { 288 typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT; 289 typedef EquivalenceClasses<InstPartition *> ToBeMergedT; 290 291 LoadToPartitionT LoadToPartition; 292 ToBeMergedT ToBeMerged; 293 294 // Step through the partitions and create equivalence between partitions 295 // that contain the same load. Also put partitions in between them in the 296 // same equivalence class to avoid reordering of memory operations. 297 for (PartitionContainerT::iterator I = PartitionContainer.begin(), 298 E = PartitionContainer.end(); 299 I != E; ++I) { 300 auto *PartI = &*I; 301 302 // If a load occurs in two partitions PartI and PartJ, merge all 303 // partitions (PartI, PartJ] into PartI. 304 for (Instruction *Inst : *PartI) 305 if (isa<LoadInst>(Inst)) { 306 bool NewElt; 307 LoadToPartitionT::iterator LoadToPart; 308 309 std::tie(LoadToPart, NewElt) = 310 LoadToPartition.insert(std::make_pair(Inst, PartI)); 311 if (!NewElt) { 312 DEBUG(dbgs() << "Merging partitions due to this load in multiple " 313 << "partitions: " << PartI << ", " 314 << LoadToPart->second << "\n" << *Inst << "\n"); 315 316 auto PartJ = I; 317 do { 318 --PartJ; 319 ToBeMerged.unionSets(PartI, &*PartJ); 320 } while (&*PartJ != LoadToPart->second); 321 } 322 } 323 } 324 if (ToBeMerged.empty()) 325 return false; 326 327 // Merge the member of an equivalence class into its class leader. This 328 // makes the members empty. 329 for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end(); 330 I != E; ++I) { 331 if (!I->isLeader()) 332 continue; 333 334 auto PartI = I->getData(); 335 for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)), 336 ToBeMerged.member_end())) { 337 PartJ->moveTo(*PartI); 338 } 339 } 340 341 // Remove the empty partitions. 342 PartitionContainer.remove_if( 343 [](const InstPartition &P) { return P.empty(); }); 344 345 return true; 346 } 347 348 /// \brief Sets up the mapping between instructions to partitions. If the 349 /// instruction is duplicated across multiple partitions, set the entry to -1. 350 void setupPartitionIdOnInstructions() { 351 int PartitionID = 0; 352 for (const auto &Partition : PartitionContainer) { 353 for (Instruction *Inst : Partition) { 354 bool NewElt; 355 InstToPartitionIdT::iterator Iter; 356 357 std::tie(Iter, NewElt) = 358 InstToPartitionId.insert(std::make_pair(Inst, PartitionID)); 359 if (!NewElt) 360 Iter->second = -1; 361 } 362 ++PartitionID; 363 } 364 } 365 366 /// \brief Populates the partition with everything that the seeding 367 /// instructions require. 368 void populateUsedSet() { 369 for (auto &P : PartitionContainer) 370 P.populateUsedSet(); 371 } 372 373 /// \brief This performs the main chunk of the work of cloning the loops for 374 /// the partitions. 375 void cloneLoops(Pass *P) { 376 BasicBlock *OrigPH = L->getLoopPreheader(); 377 // At this point the predecessor of the preheader is either the memcheck 378 // block or the top part of the original preheader. 379 BasicBlock *Pred = OrigPH->getSinglePredecessor(); 380 assert(Pred && "Preheader does not have a single predecessor"); 381 BasicBlock *ExitBlock = L->getExitBlock(); 382 assert(ExitBlock && "No single exit block"); 383 Loop *NewLoop; 384 385 assert(!PartitionContainer.empty() && "at least two partitions expected"); 386 // We're cloning the preheader along with the loop so we already made sure 387 // it was empty. 388 assert(&*OrigPH->begin() == OrigPH->getTerminator() && 389 "preheader not empty"); 390 391 // Create a loop for each partition except the last. Clone the original 392 // loop before PH along with adding a preheader for the cloned loop. Then 393 // update PH to point to the newly added preheader. 394 BasicBlock *TopPH = OrigPH; 395 unsigned Index = getSize() - 1; 396 for (auto I = std::next(PartitionContainer.rbegin()), 397 E = PartitionContainer.rend(); 398 I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) { 399 auto *Part = &*I; 400 401 NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT); 402 403 Part->getVMap()[ExitBlock] = TopPH; 404 Part->remapInstructions(); 405 } 406 Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH); 407 408 // Now go in forward order and update the immediate dominator for the 409 // preheaders with the exiting block of the previous loop. Dominance 410 // within the loop is updated in cloneLoopWithPreheader. 411 for (auto Curr = PartitionContainer.cbegin(), 412 Next = std::next(PartitionContainer.cbegin()), 413 E = PartitionContainer.cend(); 414 Next != E; ++Curr, ++Next) 415 DT->changeImmediateDominator( 416 Next->getDistributedLoop()->getLoopPreheader(), 417 Curr->getDistributedLoop()->getExitingBlock()); 418 } 419 420 /// \brief Removes the dead instructions from the cloned loops. 421 void removeUnusedInsts() { 422 for (auto &Partition : PartitionContainer) 423 Partition.removeUnusedInsts(); 424 } 425 426 /// \brief For each memory pointer, it computes the partitionId the pointer is 427 /// used in. 428 /// 429 /// This returns an array of int where the I-th entry corresponds to I-th 430 /// entry in LAI.getRuntimePointerCheck(). If the pointer is used in multiple 431 /// partitions its entry is set to -1. 432 SmallVector<int, 8> 433 computePartitionSetForPointers(const LoopAccessInfo &LAI) { 434 const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking(); 435 436 unsigned N = RtPtrCheck->Pointers.size(); 437 SmallVector<int, 8> PtrToPartitions(N); 438 for (unsigned I = 0; I < N; ++I) { 439 Value *Ptr = RtPtrCheck->Pointers[I].PointerValue; 440 auto Instructions = 441 LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr); 442 443 int &Partition = PtrToPartitions[I]; 444 // First set it to uninitialized. 445 Partition = -2; 446 for (Instruction *Inst : Instructions) { 447 // Note that this could be -1 if Inst is duplicated across multiple 448 // partitions. 449 int ThisPartition = this->InstToPartitionId[Inst]; 450 if (Partition == -2) 451 Partition = ThisPartition; 452 // -1 means belonging to multiple partitions. 453 else if (Partition == -1) 454 break; 455 else if (Partition != (int)ThisPartition) 456 Partition = -1; 457 } 458 assert(Partition != -2 && "Pointer not belonging to any partition"); 459 } 460 461 return PtrToPartitions; 462 } 463 464 void print(raw_ostream &OS) const { 465 unsigned Index = 0; 466 for (const auto &P : PartitionContainer) { 467 OS << "Partition " << Index++ << " (" << &P << "):\n"; 468 P.print(); 469 } 470 } 471 472 void dump() const { print(dbgs()); } 473 474 #ifndef NDEBUG 475 friend raw_ostream &operator<<(raw_ostream &OS, 476 const InstPartitionContainer &Partitions) { 477 Partitions.print(OS); 478 return OS; 479 } 480 #endif 481 482 void printBlocks() const { 483 unsigned Index = 0; 484 for (const auto &P : PartitionContainer) { 485 dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n"; 486 P.printBlocks(); 487 } 488 } 489 490 private: 491 typedef std::list<InstPartition> PartitionContainerT; 492 493 /// \brief List of partitions. 494 PartitionContainerT PartitionContainer; 495 496 /// \brief Mapping from Instruction to partition Id. If the instruction 497 /// belongs to multiple partitions the entry contains -1. 498 InstToPartitionIdT InstToPartitionId; 499 500 Loop *L; 501 LoopInfo *LI; 502 DominatorTree *DT; 503 504 /// \brief The control structure to merge adjacent partitions if both satisfy 505 /// the \p Predicate. 506 template <class UnaryPredicate> 507 void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) { 508 InstPartition *PrevMatch = nullptr; 509 for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) { 510 auto DoesMatch = Predicate(&*I); 511 if (PrevMatch == nullptr && DoesMatch) { 512 PrevMatch = &*I; 513 ++I; 514 } else if (PrevMatch != nullptr && DoesMatch) { 515 I->moveTo(*PrevMatch); 516 I = PartitionContainer.erase(I); 517 } else { 518 PrevMatch = nullptr; 519 ++I; 520 } 521 } 522 } 523 }; 524 525 /// \brief For each memory instruction, this class maintains difference of the 526 /// number of unsafe dependences that start out from this instruction minus 527 /// those that end here. 528 /// 529 /// By traversing the memory instructions in program order and accumulating this 530 /// number, we know whether any unsafe dependence crosses over a program point. 531 class MemoryInstructionDependences { 532 typedef MemoryDepChecker::Dependence Dependence; 533 534 public: 535 struct Entry { 536 Instruction *Inst; 537 unsigned NumUnsafeDependencesStartOrEnd; 538 539 Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {} 540 }; 541 542 typedef SmallVector<Entry, 8> AccessesType; 543 544 AccessesType::const_iterator begin() const { return Accesses.begin(); } 545 AccessesType::const_iterator end() const { return Accesses.end(); } 546 547 MemoryInstructionDependences( 548 const SmallVectorImpl<Instruction *> &Instructions, 549 const SmallVectorImpl<Dependence> &InterestingDependences) { 550 Accesses.append(Instructions.begin(), Instructions.end()); 551 552 DEBUG(dbgs() << "Backward dependences:\n"); 553 for (auto &Dep : InterestingDependences) 554 if (Dep.isPossiblyBackward()) { 555 // Note that the designations source and destination follow the program 556 // order, i.e. source is always first. (The direction is given by the 557 // DepType.) 558 ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd; 559 --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd; 560 561 DEBUG(Dep.print(dbgs(), 2, Instructions)); 562 } 563 } 564 565 private: 566 AccessesType Accesses; 567 }; 568 569 /// \brief The pass class. 570 class LoopDistribute : public FunctionPass { 571 public: 572 LoopDistribute() : FunctionPass(ID) { 573 initializeLoopDistributePass(*PassRegistry::getPassRegistry()); 574 } 575 576 bool runOnFunction(Function &F) override { 577 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 578 LAA = &getAnalysis<LoopAccessAnalysis>(); 579 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 580 581 // Build up a worklist of inner-loops to vectorize. This is necessary as the 582 // act of distributing a loop creates new loops and can invalidate iterators 583 // across the loops. 584 SmallVector<Loop *, 8> Worklist; 585 586 for (Loop *TopLevelLoop : *LI) 587 for (Loop *L : depth_first(TopLevelLoop)) 588 // We only handle inner-most loops. 589 if (L->empty()) 590 Worklist.push_back(L); 591 592 // Now walk the identified inner loops. 593 bool Changed = false; 594 for (Loop *L : Worklist) 595 Changed |= processLoop(L); 596 597 // Process each loop nest in the function. 598 return Changed; 599 } 600 601 void getAnalysisUsage(AnalysisUsage &AU) const override { 602 AU.addRequired<LoopInfoWrapperPass>(); 603 AU.addPreserved<LoopInfoWrapperPass>(); 604 AU.addRequired<LoopAccessAnalysis>(); 605 AU.addRequired<DominatorTreeWrapperPass>(); 606 AU.addPreserved<DominatorTreeWrapperPass>(); 607 } 608 609 static char ID; 610 611 private: 612 /// \brief Filter out checks between pointers from the same partition. 613 /// 614 /// \p PtrToPartition contains the partition number for pointers. Partition 615 /// number -1 means that the pointer is used in multiple partitions. In this 616 /// case we can't safely omit the check. 617 SmallVector<RuntimePointerChecking::PointerCheck, 4> 618 includeOnlyCrossPartitionChecks( 619 const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks, 620 const SmallVectorImpl<int> &PtrToPartition, 621 const RuntimePointerChecking *RtPtrChecking) { 622 SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; 623 624 std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks), 625 [&](const RuntimePointerChecking::PointerCheck &Check) { 626 for (unsigned PtrIdx1 : Check.first->Members) 627 for (unsigned PtrIdx2 : Check.second->Members) 628 // Only include this check if there is a pair of pointers 629 // that require checking and the pointers fall into 630 // separate partitions. 631 // 632 // (Note that we already know at this point that the two 633 // pointer groups need checking but it doesn't follow 634 // that each pair of pointers within the two groups need 635 // checking as well. 636 // 637 // In other words we don't want to include a check just 638 // because there is a pair of pointers between the two 639 // pointer groups that require checks and a different 640 // pair whose pointers fall into different partitions.) 641 if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) && 642 !RuntimePointerChecking::arePointersInSamePartition( 643 PtrToPartition, PtrIdx1, PtrIdx2)) 644 return true; 645 return false; 646 }); 647 648 return Checks; 649 } 650 651 /// \brief Try to distribute an inner-most loop. 652 bool processLoop(Loop *L) { 653 assert(L->empty() && "Only process inner loops."); 654 655 DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName() 656 << "\" checking " << *L << "\n"); 657 658 BasicBlock *PH = L->getLoopPreheader(); 659 if (!PH) { 660 DEBUG(dbgs() << "Skipping; no preheader"); 661 return false; 662 } 663 if (!L->getExitBlock()) { 664 DEBUG(dbgs() << "Skipping; multiple exit blocks"); 665 return false; 666 } 667 // LAA will check that we only have a single exiting block. 668 669 const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap()); 670 671 // Currently, we only distribute to isolate the part of the loop with 672 // dependence cycles to enable partial vectorization. 673 if (LAI.canVectorizeMemory()) { 674 DEBUG(dbgs() << "Skipping; memory operations are safe for vectorization"); 675 return false; 676 } 677 auto *InterestingDependences = 678 LAI.getDepChecker().getInterestingDependences(); 679 if (!InterestingDependences || InterestingDependences->empty()) { 680 DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate"); 681 return false; 682 } 683 684 InstPartitionContainer Partitions(L, LI, DT); 685 686 // First, go through each memory operation and assign them to consecutive 687 // partitions (the order of partitions follows program order). Put those 688 // with unsafe dependences into "cyclic" partition otherwise put each store 689 // in its own "non-cyclic" partition (we'll merge these later). 690 // 691 // Note that a memory operation (e.g. Load2 below) at a program point that 692 // has an unsafe dependence (Store3->Load1) spanning over it must be 693 // included in the same cyclic partition as the dependent operations. This 694 // is to preserve the original program order after distribution. E.g.: 695 // 696 // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive 697 // Load1 -. 1 0->1 698 // Load2 | /Unsafe/ 0 1 699 // Store3 -' -1 1->0 700 // Load4 0 0 701 // 702 // NumUnsafeDependencesActive > 0 indicates this situation and in this case 703 // we just keep assigning to the same cyclic partition until 704 // NumUnsafeDependencesActive reaches 0. 705 const MemoryDepChecker &DepChecker = LAI.getDepChecker(); 706 MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(), 707 *InterestingDependences); 708 709 int NumUnsafeDependencesActive = 0; 710 for (auto &InstDep : MID) { 711 Instruction *I = InstDep.Inst; 712 // We update NumUnsafeDependencesActive post-instruction, catch the 713 // start of a dependence directly via NumUnsafeDependencesStartOrEnd. 714 if (NumUnsafeDependencesActive || 715 InstDep.NumUnsafeDependencesStartOrEnd > 0) 716 Partitions.addToCyclicPartition(I); 717 else 718 Partitions.addToNewNonCyclicPartition(I); 719 NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd; 720 assert(NumUnsafeDependencesActive >= 0 && 721 "Negative number of dependences active"); 722 } 723 724 // Add partitions for values used outside. These partitions can be out of 725 // order from the original program order. This is OK because if the 726 // partition uses a load we will merge this partition with the original 727 // partition of the load that we set up in the previous loop (see 728 // mergeToAvoidDuplicatedLoads). 729 auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L); 730 for (auto *Inst : DefsUsedOutside) 731 Partitions.addToNewNonCyclicPartition(Inst); 732 733 DEBUG(dbgs() << "Seeded partitions:\n" << Partitions); 734 if (Partitions.getSize() < 2) 735 return false; 736 737 // Run the merge heuristics: Merge non-cyclic adjacent partitions since we 738 // should be able to vectorize these together. 739 Partitions.mergeBeforePopulating(); 740 DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions); 741 if (Partitions.getSize() < 2) 742 return false; 743 744 // Now, populate the partitions with non-memory operations. 745 Partitions.populateUsedSet(); 746 DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions); 747 748 // In order to preserve original lexical order for loads, keep them in the 749 // partition that we set up in the MemoryInstructionDependences loop. 750 if (Partitions.mergeToAvoidDuplicatedLoads()) { 751 DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n" 752 << Partitions); 753 if (Partitions.getSize() < 2) 754 return false; 755 } 756 757 DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); 758 // We're done forming the partitions set up the reverse mapping from 759 // instructions to partitions. 760 Partitions.setupPartitionIdOnInstructions(); 761 762 // To keep things simple have an empty preheader before we version or clone 763 // the loop. (Also split if this has no predecessor, i.e. entry, because we 764 // rely on PH having a predecessor.) 765 if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator()) 766 SplitBlock(PH, PH->getTerminator(), DT, LI); 767 768 // If we need run-time checks to disambiguate pointers are run-time, version 769 // the loop now. 770 auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI); 771 const auto *RtPtrChecking = LAI.getRuntimePointerChecking(); 772 const auto &AllChecks = RtPtrChecking->getChecks(); 773 auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition, 774 RtPtrChecking); 775 if (!Checks.empty()) { 776 DEBUG(dbgs() << "\nPointers:\n"); 777 DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks)); 778 LoopVersioning LVer(std::move(Checks), LAI, L, LI, DT); 779 LVer.versionLoop(); 780 LVer.addPHINodes(DefsUsedOutside); 781 } 782 783 // Create identical copies of the original loop for each partition and hook 784 // them up sequentially. 785 Partitions.cloneLoops(this); 786 787 // Now, we remove the instruction from each loop that don't belong to that 788 // partition. 789 Partitions.removeUnusedInsts(); 790 DEBUG(dbgs() << "\nAfter removing unused Instrs:\n"); 791 DEBUG(Partitions.printBlocks()); 792 793 if (LDistVerify) { 794 LI->verify(); 795 DT->verifyDomTree(); 796 } 797 798 ++NumLoopsDistributed; 799 return true; 800 } 801 802 // Analyses used. 803 LoopInfo *LI; 804 LoopAccessAnalysis *LAA; 805 DominatorTree *DT; 806 }; 807 } // anonymous namespace 808 809 char LoopDistribute::ID; 810 static const char ldist_name[] = "Loop Distribition"; 811 812 INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false) 813 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 814 INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) 815 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 816 INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false) 817 818 namespace llvm { 819 FunctionPass *createLoopDistributePass() { return new LoopDistribute(); } 820 } 821