1 //===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements the Loop Distribution Pass. Its main focus is to 11 // distribute loops that cannot be vectorized due to dependence cycles. It 12 // tries to isolate the offending dependences into a new loop allowing 13 // vectorization of the remaining parts. 14 // 15 // For dependence analysis, the pass uses the LoopVectorizer's 16 // LoopAccessAnalysis. Because this analysis presumes no change in the order of 17 // memory operations, special care is taken to preserve the lexical order of 18 // these operations. 19 // 20 // Similarly to the Vectorizer, the pass also supports loop versioning to 21 // run-time disambiguate potentially overlapping arrays. 22 // 23 //===----------------------------------------------------------------------===// 24 25 #include "llvm/ADT/DepthFirstIterator.h" 26 #include "llvm/ADT/EquivalenceClasses.h" 27 #include "llvm/ADT/STLExtras.h" 28 #include "llvm/ADT/Statistic.h" 29 #include "llvm/Analysis/LoopAccessAnalysis.h" 30 #include "llvm/Analysis/LoopInfo.h" 31 #include "llvm/IR/Dominators.h" 32 #include "llvm/Pass.h" 33 #include "llvm/Support/CommandLine.h" 34 #include "llvm/Support/Debug.h" 35 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 36 #include "llvm/Transforms/Utils/Cloning.h" 37 #include "llvm/Transforms/Utils/LoopUtils.h" 38 #include "llvm/Transforms/Utils/LoopVersioning.h" 39 #include <list> 40 41 #define LDIST_NAME "loop-distribute" 42 #define DEBUG_TYPE LDIST_NAME 43 44 using namespace llvm; 45 46 static cl::opt<bool> 47 LDistVerify("loop-distribute-verify", cl::Hidden, 48 cl::desc("Turn on DominatorTree and LoopInfo verification " 49 "after Loop Distribution"), 50 cl::init(false)); 51 52 static cl::opt<bool> DistributeNonIfConvertible( 53 "loop-distribute-non-if-convertible", cl::Hidden, 54 cl::desc("Whether to distribute into a loop that may not be " 55 "if-convertible by the loop vectorizer"), 56 cl::init(false)); 57 58 static cl::opt<unsigned> DistributeSCEVCheckThreshold( 59 "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden, 60 cl::desc("The maximum number of SCEV checks allowed for Loop " 61 "Distribution")); 62 63 static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold( 64 "loop-distribute-scev-check-threshold-with-pragma", cl::init(128), 65 cl::Hidden, 66 cl::desc( 67 "The maximum number of SCEV checks allowed for Loop " 68 "Distribution for loop marked with #pragma loop distribute(enable)")); 69 70 // Note that the initial value for this depends on whether the pass is invoked 71 // directly or from the optimization pipeline. 72 static cl::opt<bool> EnableLoopDistribute( 73 "enable-loop-distribute", cl::Hidden, 74 cl::desc("Enable the new, experimental LoopDistribution Pass")); 75 76 STATISTIC(NumLoopsDistributed, "Number of loops distributed"); 77 78 namespace { 79 /// \brief Maintains the set of instructions of the loop for a partition before 80 /// cloning. After cloning, it hosts the new loop. 81 class InstPartition { 82 typedef SmallPtrSet<Instruction *, 8> InstructionSet; 83 84 public: 85 InstPartition(Instruction *I, Loop *L, bool DepCycle = false) 86 : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) { 87 Set.insert(I); 88 } 89 90 /// \brief Returns whether this partition contains a dependence cycle. 91 bool hasDepCycle() const { return DepCycle; } 92 93 /// \brief Adds an instruction to this partition. 94 void add(Instruction *I) { Set.insert(I); } 95 96 /// \brief Collection accessors. 97 InstructionSet::iterator begin() { return Set.begin(); } 98 InstructionSet::iterator end() { return Set.end(); } 99 InstructionSet::const_iterator begin() const { return Set.begin(); } 100 InstructionSet::const_iterator end() const { return Set.end(); } 101 bool empty() const { return Set.empty(); } 102 103 /// \brief Moves this partition into \p Other. This partition becomes empty 104 /// after this. 105 void moveTo(InstPartition &Other) { 106 Other.Set.insert(Set.begin(), Set.end()); 107 Set.clear(); 108 Other.DepCycle |= DepCycle; 109 } 110 111 /// \brief Populates the partition with a transitive closure of all the 112 /// instructions that the seeded instructions dependent on. 113 void populateUsedSet() { 114 // FIXME: We currently don't use control-dependence but simply include all 115 // blocks (possibly empty at the end) and let simplifycfg mostly clean this 116 // up. 117 for (auto *B : OrigLoop->getBlocks()) 118 Set.insert(B->getTerminator()); 119 120 // Follow the use-def chains to form a transitive closure of all the 121 // instructions that the originally seeded instructions depend on. 122 SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end()); 123 while (!Worklist.empty()) { 124 Instruction *I = Worklist.pop_back_val(); 125 // Insert instructions from the loop that we depend on. 126 for (Value *V : I->operand_values()) { 127 auto *I = dyn_cast<Instruction>(V); 128 if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second) 129 Worklist.push_back(I); 130 } 131 } 132 } 133 134 /// \brief Clones the original loop. 135 /// 136 /// Updates LoopInfo and DominatorTree using the information that block \p 137 /// LoopDomBB dominates the loop. 138 Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB, 139 unsigned Index, LoopInfo *LI, 140 DominatorTree *DT) { 141 ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop, 142 VMap, Twine(".ldist") + Twine(Index), 143 LI, DT, ClonedLoopBlocks); 144 return ClonedLoop; 145 } 146 147 /// \brief The cloned loop. If this partition is mapped to the original loop, 148 /// this is null. 149 const Loop *getClonedLoop() const { return ClonedLoop; } 150 151 /// \brief Returns the loop where this partition ends up after distribution. 152 /// If this partition is mapped to the original loop then use the block from 153 /// the loop. 154 const Loop *getDistributedLoop() const { 155 return ClonedLoop ? ClonedLoop : OrigLoop; 156 } 157 158 /// \brief The VMap that is populated by cloning and then used in 159 /// remapinstruction to remap the cloned instructions. 160 ValueToValueMapTy &getVMap() { return VMap; } 161 162 /// \brief Remaps the cloned instructions using VMap. 163 void remapInstructions() { 164 remapInstructionsInBlocks(ClonedLoopBlocks, VMap); 165 } 166 167 /// \brief Based on the set of instructions selected for this partition, 168 /// removes the unnecessary ones. 169 void removeUnusedInsts() { 170 SmallVector<Instruction *, 8> Unused; 171 172 for (auto *Block : OrigLoop->getBlocks()) 173 for (auto &Inst : *Block) 174 if (!Set.count(&Inst)) { 175 Instruction *NewInst = &Inst; 176 if (!VMap.empty()) 177 NewInst = cast<Instruction>(VMap[NewInst]); 178 179 assert(!isa<BranchInst>(NewInst) && 180 "Branches are marked used early on"); 181 Unused.push_back(NewInst); 182 } 183 184 // Delete the instructions backwards, as it has a reduced likelihood of 185 // having to update as many def-use and use-def chains. 186 for (auto *Inst : make_range(Unused.rbegin(), Unused.rend())) { 187 if (!Inst->use_empty()) 188 Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); 189 Inst->eraseFromParent(); 190 } 191 } 192 193 void print() const { 194 if (DepCycle) 195 dbgs() << " (cycle)\n"; 196 for (auto *I : Set) 197 // Prefix with the block name. 198 dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n"; 199 } 200 201 void printBlocks() const { 202 for (auto *BB : getDistributedLoop()->getBlocks()) 203 dbgs() << *BB; 204 } 205 206 private: 207 /// \brief Instructions from OrigLoop selected for this partition. 208 InstructionSet Set; 209 210 /// \brief Whether this partition contains a dependence cycle. 211 bool DepCycle; 212 213 /// \brief The original loop. 214 Loop *OrigLoop; 215 216 /// \brief The cloned loop. If this partition is mapped to the original loop, 217 /// this is null. 218 Loop *ClonedLoop; 219 220 /// \brief The blocks of ClonedLoop including the preheader. If this 221 /// partition is mapped to the original loop, this is empty. 222 SmallVector<BasicBlock *, 8> ClonedLoopBlocks; 223 224 /// \brief These gets populated once the set of instructions have been 225 /// finalized. If this partition is mapped to the original loop, these are not 226 /// set. 227 ValueToValueMapTy VMap; 228 }; 229 230 /// \brief Holds the set of Partitions. It populates them, merges them and then 231 /// clones the loops. 232 class InstPartitionContainer { 233 typedef DenseMap<Instruction *, int> InstToPartitionIdT; 234 235 public: 236 InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT) 237 : L(L), LI(LI), DT(DT) {} 238 239 /// \brief Returns the number of partitions. 240 unsigned getSize() const { return PartitionContainer.size(); } 241 242 /// \brief Adds \p Inst into the current partition if that is marked to 243 /// contain cycles. Otherwise start a new partition for it. 244 void addToCyclicPartition(Instruction *Inst) { 245 // If the current partition is non-cyclic. Start a new one. 246 if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle()) 247 PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true); 248 else 249 PartitionContainer.back().add(Inst); 250 } 251 252 /// \brief Adds \p Inst into a partition that is not marked to contain 253 /// dependence cycles. 254 /// 255 // Initially we isolate memory instructions into as many partitions as 256 // possible, then later we may merge them back together. 257 void addToNewNonCyclicPartition(Instruction *Inst) { 258 PartitionContainer.emplace_back(Inst, L); 259 } 260 261 /// \brief Merges adjacent non-cyclic partitions. 262 /// 263 /// The idea is that we currently only want to isolate the non-vectorizable 264 /// partition. We could later allow more distribution among these partition 265 /// too. 266 void mergeAdjacentNonCyclic() { 267 mergeAdjacentPartitionsIf( 268 [](const InstPartition *P) { return !P->hasDepCycle(); }); 269 } 270 271 /// \brief If a partition contains only conditional stores, we won't vectorize 272 /// it. Try to merge it with a previous cyclic partition. 273 void mergeNonIfConvertible() { 274 mergeAdjacentPartitionsIf([&](const InstPartition *Partition) { 275 if (Partition->hasDepCycle()) 276 return true; 277 278 // Now, check if all stores are conditional in this partition. 279 bool seenStore = false; 280 281 for (auto *Inst : *Partition) 282 if (isa<StoreInst>(Inst)) { 283 seenStore = true; 284 if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT)) 285 return false; 286 } 287 return seenStore; 288 }); 289 } 290 291 /// \brief Merges the partitions according to various heuristics. 292 void mergeBeforePopulating() { 293 mergeAdjacentNonCyclic(); 294 if (!DistributeNonIfConvertible) 295 mergeNonIfConvertible(); 296 } 297 298 /// \brief Merges partitions in order to ensure that no loads are duplicated. 299 /// 300 /// We can't duplicate loads because that could potentially reorder them. 301 /// LoopAccessAnalysis provides dependency information with the context that 302 /// the order of memory operation is preserved. 303 /// 304 /// Return if any partitions were merged. 305 bool mergeToAvoidDuplicatedLoads() { 306 typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT; 307 typedef EquivalenceClasses<InstPartition *> ToBeMergedT; 308 309 LoadToPartitionT LoadToPartition; 310 ToBeMergedT ToBeMerged; 311 312 // Step through the partitions and create equivalence between partitions 313 // that contain the same load. Also put partitions in between them in the 314 // same equivalence class to avoid reordering of memory operations. 315 for (PartitionContainerT::iterator I = PartitionContainer.begin(), 316 E = PartitionContainer.end(); 317 I != E; ++I) { 318 auto *PartI = &*I; 319 320 // If a load occurs in two partitions PartI and PartJ, merge all 321 // partitions (PartI, PartJ] into PartI. 322 for (Instruction *Inst : *PartI) 323 if (isa<LoadInst>(Inst)) { 324 bool NewElt; 325 LoadToPartitionT::iterator LoadToPart; 326 327 std::tie(LoadToPart, NewElt) = 328 LoadToPartition.insert(std::make_pair(Inst, PartI)); 329 if (!NewElt) { 330 DEBUG(dbgs() << "Merging partitions due to this load in multiple " 331 << "partitions: " << PartI << ", " 332 << LoadToPart->second << "\n" << *Inst << "\n"); 333 334 auto PartJ = I; 335 do { 336 --PartJ; 337 ToBeMerged.unionSets(PartI, &*PartJ); 338 } while (&*PartJ != LoadToPart->second); 339 } 340 } 341 } 342 if (ToBeMerged.empty()) 343 return false; 344 345 // Merge the member of an equivalence class into its class leader. This 346 // makes the members empty. 347 for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end(); 348 I != E; ++I) { 349 if (!I->isLeader()) 350 continue; 351 352 auto PartI = I->getData(); 353 for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)), 354 ToBeMerged.member_end())) { 355 PartJ->moveTo(*PartI); 356 } 357 } 358 359 // Remove the empty partitions. 360 PartitionContainer.remove_if( 361 [](const InstPartition &P) { return P.empty(); }); 362 363 return true; 364 } 365 366 /// \brief Sets up the mapping between instructions to partitions. If the 367 /// instruction is duplicated across multiple partitions, set the entry to -1. 368 void setupPartitionIdOnInstructions() { 369 int PartitionID = 0; 370 for (const auto &Partition : PartitionContainer) { 371 for (Instruction *Inst : Partition) { 372 bool NewElt; 373 InstToPartitionIdT::iterator Iter; 374 375 std::tie(Iter, NewElt) = 376 InstToPartitionId.insert(std::make_pair(Inst, PartitionID)); 377 if (!NewElt) 378 Iter->second = -1; 379 } 380 ++PartitionID; 381 } 382 } 383 384 /// \brief Populates the partition with everything that the seeding 385 /// instructions require. 386 void populateUsedSet() { 387 for (auto &P : PartitionContainer) 388 P.populateUsedSet(); 389 } 390 391 /// \brief This performs the main chunk of the work of cloning the loops for 392 /// the partitions. 393 void cloneLoops() { 394 BasicBlock *OrigPH = L->getLoopPreheader(); 395 // At this point the predecessor of the preheader is either the memcheck 396 // block or the top part of the original preheader. 397 BasicBlock *Pred = OrigPH->getSinglePredecessor(); 398 assert(Pred && "Preheader does not have a single predecessor"); 399 BasicBlock *ExitBlock = L->getExitBlock(); 400 assert(ExitBlock && "No single exit block"); 401 Loop *NewLoop; 402 403 assert(!PartitionContainer.empty() && "at least two partitions expected"); 404 // We're cloning the preheader along with the loop so we already made sure 405 // it was empty. 406 assert(&*OrigPH->begin() == OrigPH->getTerminator() && 407 "preheader not empty"); 408 409 // Create a loop for each partition except the last. Clone the original 410 // loop before PH along with adding a preheader for the cloned loop. Then 411 // update PH to point to the newly added preheader. 412 BasicBlock *TopPH = OrigPH; 413 unsigned Index = getSize() - 1; 414 for (auto I = std::next(PartitionContainer.rbegin()), 415 E = PartitionContainer.rend(); 416 I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) { 417 auto *Part = &*I; 418 419 NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT); 420 421 Part->getVMap()[ExitBlock] = TopPH; 422 Part->remapInstructions(); 423 } 424 Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH); 425 426 // Now go in forward order and update the immediate dominator for the 427 // preheaders with the exiting block of the previous loop. Dominance 428 // within the loop is updated in cloneLoopWithPreheader. 429 for (auto Curr = PartitionContainer.cbegin(), 430 Next = std::next(PartitionContainer.cbegin()), 431 E = PartitionContainer.cend(); 432 Next != E; ++Curr, ++Next) 433 DT->changeImmediateDominator( 434 Next->getDistributedLoop()->getLoopPreheader(), 435 Curr->getDistributedLoop()->getExitingBlock()); 436 } 437 438 /// \brief Removes the dead instructions from the cloned loops. 439 void removeUnusedInsts() { 440 for (auto &Partition : PartitionContainer) 441 Partition.removeUnusedInsts(); 442 } 443 444 /// \brief For each memory pointer, it computes the partitionId the pointer is 445 /// used in. 446 /// 447 /// This returns an array of int where the I-th entry corresponds to I-th 448 /// entry in LAI.getRuntimePointerCheck(). If the pointer is used in multiple 449 /// partitions its entry is set to -1. 450 SmallVector<int, 8> 451 computePartitionSetForPointers(const LoopAccessInfo &LAI) { 452 const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking(); 453 454 unsigned N = RtPtrCheck->Pointers.size(); 455 SmallVector<int, 8> PtrToPartitions(N); 456 for (unsigned I = 0; I < N; ++I) { 457 Value *Ptr = RtPtrCheck->Pointers[I].PointerValue; 458 auto Instructions = 459 LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr); 460 461 int &Partition = PtrToPartitions[I]; 462 // First set it to uninitialized. 463 Partition = -2; 464 for (Instruction *Inst : Instructions) { 465 // Note that this could be -1 if Inst is duplicated across multiple 466 // partitions. 467 int ThisPartition = this->InstToPartitionId[Inst]; 468 if (Partition == -2) 469 Partition = ThisPartition; 470 // -1 means belonging to multiple partitions. 471 else if (Partition == -1) 472 break; 473 else if (Partition != (int)ThisPartition) 474 Partition = -1; 475 } 476 assert(Partition != -2 && "Pointer not belonging to any partition"); 477 } 478 479 return PtrToPartitions; 480 } 481 482 void print(raw_ostream &OS) const { 483 unsigned Index = 0; 484 for (const auto &P : PartitionContainer) { 485 OS << "Partition " << Index++ << " (" << &P << "):\n"; 486 P.print(); 487 } 488 } 489 490 void dump() const { print(dbgs()); } 491 492 #ifndef NDEBUG 493 friend raw_ostream &operator<<(raw_ostream &OS, 494 const InstPartitionContainer &Partitions) { 495 Partitions.print(OS); 496 return OS; 497 } 498 #endif 499 500 void printBlocks() const { 501 unsigned Index = 0; 502 for (const auto &P : PartitionContainer) { 503 dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n"; 504 P.printBlocks(); 505 } 506 } 507 508 private: 509 typedef std::list<InstPartition> PartitionContainerT; 510 511 /// \brief List of partitions. 512 PartitionContainerT PartitionContainer; 513 514 /// \brief Mapping from Instruction to partition Id. If the instruction 515 /// belongs to multiple partitions the entry contains -1. 516 InstToPartitionIdT InstToPartitionId; 517 518 Loop *L; 519 LoopInfo *LI; 520 DominatorTree *DT; 521 522 /// \brief The control structure to merge adjacent partitions if both satisfy 523 /// the \p Predicate. 524 template <class UnaryPredicate> 525 void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) { 526 InstPartition *PrevMatch = nullptr; 527 for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) { 528 auto DoesMatch = Predicate(&*I); 529 if (PrevMatch == nullptr && DoesMatch) { 530 PrevMatch = &*I; 531 ++I; 532 } else if (PrevMatch != nullptr && DoesMatch) { 533 I->moveTo(*PrevMatch); 534 I = PartitionContainer.erase(I); 535 } else { 536 PrevMatch = nullptr; 537 ++I; 538 } 539 } 540 } 541 }; 542 543 /// \brief For each memory instruction, this class maintains difference of the 544 /// number of unsafe dependences that start out from this instruction minus 545 /// those that end here. 546 /// 547 /// By traversing the memory instructions in program order and accumulating this 548 /// number, we know whether any unsafe dependence crosses over a program point. 549 class MemoryInstructionDependences { 550 typedef MemoryDepChecker::Dependence Dependence; 551 552 public: 553 struct Entry { 554 Instruction *Inst; 555 unsigned NumUnsafeDependencesStartOrEnd; 556 557 Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {} 558 }; 559 560 typedef SmallVector<Entry, 8> AccessesType; 561 562 AccessesType::const_iterator begin() const { return Accesses.begin(); } 563 AccessesType::const_iterator end() const { return Accesses.end(); } 564 565 MemoryInstructionDependences( 566 const SmallVectorImpl<Instruction *> &Instructions, 567 const SmallVectorImpl<Dependence> &Dependences) { 568 Accesses.append(Instructions.begin(), Instructions.end()); 569 570 DEBUG(dbgs() << "Backward dependences:\n"); 571 for (auto &Dep : Dependences) 572 if (Dep.isPossiblyBackward()) { 573 // Note that the designations source and destination follow the program 574 // order, i.e. source is always first. (The direction is given by the 575 // DepType.) 576 ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd; 577 --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd; 578 579 DEBUG(Dep.print(dbgs(), 2, Instructions)); 580 } 581 } 582 583 private: 584 AccessesType Accesses; 585 }; 586 587 /// \brief The actual class performing the per-loop work. 588 class LoopDistributeForLoop { 589 public: 590 LoopDistributeForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI, 591 DominatorTree *DT, ScalarEvolution *SE) 592 : L(L), LI(LI), LAI(LAI), DT(DT), SE(SE) { 593 setForced(); 594 } 595 596 /// \brief Try to distribute an inner-most loop. 597 bool processLoop() { 598 assert(L->empty() && "Only process inner loops."); 599 600 DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName() 601 << "\" checking " << *L << "\n"); 602 603 BasicBlock *PH = L->getLoopPreheader(); 604 if (!PH) { 605 DEBUG(dbgs() << "Skipping; no preheader"); 606 return false; 607 } 608 if (!L->getExitBlock()) { 609 DEBUG(dbgs() << "Skipping; multiple exit blocks"); 610 return false; 611 } 612 // LAA will check that we only have a single exiting block. 613 614 // Currently, we only distribute to isolate the part of the loop with 615 // dependence cycles to enable partial vectorization. 616 if (LAI.canVectorizeMemory()) { 617 DEBUG(dbgs() << "Skipping; memory operations are safe for vectorization"); 618 return false; 619 } 620 auto *Dependences = LAI.getDepChecker().getDependences(); 621 if (!Dependences || Dependences->empty()) { 622 DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate"); 623 return false; 624 } 625 626 InstPartitionContainer Partitions(L, LI, DT); 627 628 // First, go through each memory operation and assign them to consecutive 629 // partitions (the order of partitions follows program order). Put those 630 // with unsafe dependences into "cyclic" partition otherwise put each store 631 // in its own "non-cyclic" partition (we'll merge these later). 632 // 633 // Note that a memory operation (e.g. Load2 below) at a program point that 634 // has an unsafe dependence (Store3->Load1) spanning over it must be 635 // included in the same cyclic partition as the dependent operations. This 636 // is to preserve the original program order after distribution. E.g.: 637 // 638 // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive 639 // Load1 -. 1 0->1 640 // Load2 | /Unsafe/ 0 1 641 // Store3 -' -1 1->0 642 // Load4 0 0 643 // 644 // NumUnsafeDependencesActive > 0 indicates this situation and in this case 645 // we just keep assigning to the same cyclic partition until 646 // NumUnsafeDependencesActive reaches 0. 647 const MemoryDepChecker &DepChecker = LAI.getDepChecker(); 648 MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(), 649 *Dependences); 650 651 int NumUnsafeDependencesActive = 0; 652 for (auto &InstDep : MID) { 653 Instruction *I = InstDep.Inst; 654 // We update NumUnsafeDependencesActive post-instruction, catch the 655 // start of a dependence directly via NumUnsafeDependencesStartOrEnd. 656 if (NumUnsafeDependencesActive || 657 InstDep.NumUnsafeDependencesStartOrEnd > 0) 658 Partitions.addToCyclicPartition(I); 659 else 660 Partitions.addToNewNonCyclicPartition(I); 661 NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd; 662 assert(NumUnsafeDependencesActive >= 0 && 663 "Negative number of dependences active"); 664 } 665 666 // Add partitions for values used outside. These partitions can be out of 667 // order from the original program order. This is OK because if the 668 // partition uses a load we will merge this partition with the original 669 // partition of the load that we set up in the previous loop (see 670 // mergeToAvoidDuplicatedLoads). 671 auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L); 672 for (auto *Inst : DefsUsedOutside) 673 Partitions.addToNewNonCyclicPartition(Inst); 674 675 DEBUG(dbgs() << "Seeded partitions:\n" << Partitions); 676 if (Partitions.getSize() < 2) 677 return false; 678 679 // Run the merge heuristics: Merge non-cyclic adjacent partitions since we 680 // should be able to vectorize these together. 681 Partitions.mergeBeforePopulating(); 682 DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions); 683 if (Partitions.getSize() < 2) 684 return false; 685 686 // Now, populate the partitions with non-memory operations. 687 Partitions.populateUsedSet(); 688 DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions); 689 690 // In order to preserve original lexical order for loads, keep them in the 691 // partition that we set up in the MemoryInstructionDependences loop. 692 if (Partitions.mergeToAvoidDuplicatedLoads()) { 693 DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n" 694 << Partitions); 695 if (Partitions.getSize() < 2) 696 return false; 697 } 698 699 // Don't distribute the loop if we need too many SCEV run-time checks. 700 const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate(); 701 if (Pred.getComplexity() > (IsForced.getValueOr(false) 702 ? PragmaDistributeSCEVCheckThreshold 703 : DistributeSCEVCheckThreshold)) { 704 DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); 705 return false; 706 } 707 708 DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); 709 // We're done forming the partitions set up the reverse mapping from 710 // instructions to partitions. 711 Partitions.setupPartitionIdOnInstructions(); 712 713 // To keep things simple have an empty preheader before we version or clone 714 // the loop. (Also split if this has no predecessor, i.e. entry, because we 715 // rely on PH having a predecessor.) 716 if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator()) 717 SplitBlock(PH, PH->getTerminator(), DT, LI); 718 719 // If we need run-time checks, version the loop now. 720 auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI); 721 const auto *RtPtrChecking = LAI.getRuntimePointerChecking(); 722 const auto &AllChecks = RtPtrChecking->getChecks(); 723 auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition, 724 RtPtrChecking); 725 726 if (!Pred.isAlwaysTrue() || !Checks.empty()) { 727 DEBUG(dbgs() << "\nPointers:\n"); 728 DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks)); 729 LoopVersioning LVer(LAI, L, LI, DT, SE, false); 730 LVer.setAliasChecks(std::move(Checks)); 731 LVer.setSCEVChecks(LAI.PSE.getUnionPredicate()); 732 LVer.versionLoop(DefsUsedOutside); 733 LVer.annotateLoopWithNoAlias(); 734 } 735 736 // Create identical copies of the original loop for each partition and hook 737 // them up sequentially. 738 Partitions.cloneLoops(); 739 740 // Now, we remove the instruction from each loop that don't belong to that 741 // partition. 742 Partitions.removeUnusedInsts(); 743 DEBUG(dbgs() << "\nAfter removing unused Instrs:\n"); 744 DEBUG(Partitions.printBlocks()); 745 746 if (LDistVerify) { 747 LI->verify(); 748 DT->verifyDomTree(); 749 } 750 751 ++NumLoopsDistributed; 752 return true; 753 } 754 755 /// \brief Return if distribution forced to be enabled/disabled for the loop. 756 /// 757 /// If the optional has a value, it indicates whether distribution was forced 758 /// to be enabled (true) or disabled (false). If the optional has no value 759 /// distribution was not forced either way. 760 const Optional<bool> &isForced() const { return IsForced; } 761 762 private: 763 /// \brief Filter out checks between pointers from the same partition. 764 /// 765 /// \p PtrToPartition contains the partition number for pointers. Partition 766 /// number -1 means that the pointer is used in multiple partitions. In this 767 /// case we can't safely omit the check. 768 SmallVector<RuntimePointerChecking::PointerCheck, 4> 769 includeOnlyCrossPartitionChecks( 770 const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks, 771 const SmallVectorImpl<int> &PtrToPartition, 772 const RuntimePointerChecking *RtPtrChecking) { 773 SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; 774 775 std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks), 776 [&](const RuntimePointerChecking::PointerCheck &Check) { 777 for (unsigned PtrIdx1 : Check.first->Members) 778 for (unsigned PtrIdx2 : Check.second->Members) 779 // Only include this check if there is a pair of pointers 780 // that require checking and the pointers fall into 781 // separate partitions. 782 // 783 // (Note that we already know at this point that the two 784 // pointer groups need checking but it doesn't follow 785 // that each pair of pointers within the two groups need 786 // checking as well. 787 // 788 // In other words we don't want to include a check just 789 // because there is a pair of pointers between the two 790 // pointer groups that require checks and a different 791 // pair whose pointers fall into different partitions.) 792 if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) && 793 !RuntimePointerChecking::arePointersInSamePartition( 794 PtrToPartition, PtrIdx1, PtrIdx2)) 795 return true; 796 return false; 797 }); 798 799 return Checks; 800 } 801 802 /// \brief Check whether the loop metadata is forcing distribution to be 803 /// enabled/disabled. 804 void setForced() { 805 Optional<const MDOperand *> Value = 806 findStringMetadataForLoop(L, "llvm.loop.distribute.enable"); 807 if (!Value) 808 return; 809 810 const MDOperand *Op = *Value; 811 assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata"); 812 IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue(); 813 } 814 815 // Analyses used. 816 Loop *L; 817 LoopInfo *LI; 818 const LoopAccessInfo &LAI; 819 DominatorTree *DT; 820 ScalarEvolution *SE; 821 822 /// \brief Indicates whether distribution is forced to be enabled/disabled for 823 /// the loop. 824 /// 825 /// If the optional has a value, it indicates whether distribution was forced 826 /// to be enabled (true) or disabled (false). If the optional has no value 827 /// distribution was not forced either way. 828 Optional<bool> IsForced; 829 }; 830 831 /// \brief The pass class. 832 class LoopDistribute : public FunctionPass { 833 public: 834 /// \p ProcessAllLoopsByDefault specifies whether loop distribution should be 835 /// performed by default. Pass -enable-loop-distribute={0,1} overrides this 836 /// default. We use this to keep LoopDistribution off by default when invoked 837 /// from the optimization pipeline but on when invoked explicitly from opt. 838 LoopDistribute(bool ProcessAllLoopsByDefault = true) 839 : FunctionPass(ID), ProcessAllLoops(ProcessAllLoopsByDefault) { 840 // The default is set by the caller. 841 if (EnableLoopDistribute.getNumOccurrences() > 0) 842 ProcessAllLoops = EnableLoopDistribute; 843 initializeLoopDistributePass(*PassRegistry::getPassRegistry()); 844 } 845 846 bool runOnFunction(Function &F) override { 847 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 848 auto *LAA = &getAnalysis<LoopAccessAnalysis>(); 849 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 850 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 851 852 // Build up a worklist of inner-loops to vectorize. This is necessary as the 853 // act of distributing a loop creates new loops and can invalidate iterators 854 // across the loops. 855 SmallVector<Loop *, 8> Worklist; 856 857 for (Loop *TopLevelLoop : *LI) 858 for (Loop *L : depth_first(TopLevelLoop)) 859 // We only handle inner-most loops. 860 if (L->empty()) 861 Worklist.push_back(L); 862 863 // Now walk the identified inner loops. 864 bool Changed = false; 865 for (Loop *L : Worklist) { 866 const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap()); 867 LoopDistributeForLoop LDL(L, LI, LAI, DT, SE); 868 869 // If distribution was forced for the specific loop to be 870 // enabled/disabled, follow that. Otherwise use the global flag. 871 if (LDL.isForced().getValueOr(ProcessAllLoops)) 872 Changed |= LDL.processLoop(); 873 } 874 875 // Process each loop nest in the function. 876 return Changed; 877 } 878 879 void getAnalysisUsage(AnalysisUsage &AU) const override { 880 AU.addRequired<ScalarEvolutionWrapperPass>(); 881 AU.addRequired<LoopInfoWrapperPass>(); 882 AU.addPreserved<LoopInfoWrapperPass>(); 883 AU.addRequired<LoopAccessAnalysis>(); 884 AU.addRequired<DominatorTreeWrapperPass>(); 885 AU.addPreserved<DominatorTreeWrapperPass>(); 886 } 887 888 static char ID; 889 890 private: 891 /// \brief Whether distribution should be on in this function. The per-loop 892 /// pragma can override this. 893 bool ProcessAllLoops; 894 }; 895 } // anonymous namespace 896 897 char LoopDistribute::ID; 898 static const char ldist_name[] = "Loop Distribition"; 899 900 INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false) 901 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 902 INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) 903 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 904 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 905 INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false) 906 907 namespace llvm { 908 FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault) { 909 return new LoopDistribute(ProcessAllLoopsByDefault); 910 } 911 } 912