//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the Loop Distribution Pass. Its main focus is to
// distribute loops that cannot be vectorized due to dependence cycles. It
// tries to isolate the offending dependences into a new loop allowing
// vectorization of the remaining parts.
//
// For dependence analysis, the pass uses the LoopVectorizer's
// LoopAccessAnalysis. Because this analysis presumes no change in the order of
// memory operations, special care is taken to preserve the lexical order of
// these operations.
//
// Similarly to the Vectorizer, the pass also supports loop versioning to
// run-time disambiguate potentially overlapping arrays.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <list>

#define LDIST_NAME "loop-distribute"
#define DEBUG_TYPE LDIST_NAME

using namespace llvm;

static cl::opt<bool>
    LDistVerify("loop-distribute-verify", cl::Hidden,
                cl::desc("Turn on DominatorTree and LoopInfo verification "
                         "after Loop Distribution"),
                cl::init(false));

static cl::opt<bool> DistributeNonIfConvertible(
    "loop-distribute-non-if-convertible", cl::Hidden,
    cl::desc("Whether to distribute into a loop that may not be "
             "if-convertible by the loop vectorizer"),
    cl::init(false));

static cl::opt<unsigned> DistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution"));

static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
    cl::Hidden,
    cl::desc(
        "The maximum number of SCEV checks allowed for Loop "
        "Distribution for loop marked with #pragma loop distribute(enable)"));

// Note that the initial value for this depends on whether the pass is invoked
// directly or from the optimization pipeline.
static cl::opt<bool> EnableLoopDistribute(
    "enable-loop-distribute", cl::Hidden,
    cl::desc("Enable the new, experimental LoopDistribution Pass"));

STATISTIC(NumLoopsDistributed, "Number of loops distributed");

namespace {
/// \brief Maintains the set of instructions of the loop for a partition before
/// cloning. After cloning, it hosts the new loop.
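///
/// Typical life cycle (as driven by InstPartitionContainer below): the
/// partition is seeded with one or more memory instructions, extended to
/// their transitive use-def closure with populateUsedSet, and then either
/// left mapped to the original loop or given its own copy via
/// cloneLoopWithPreheader followed by remapInstructions.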
class InstPartition {
  typedef SmallPtrSet<Instruction *, 8> InstructionSet;

public:
  InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
      : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) {
    Set.insert(I);
  }

  /// \brief Returns whether this partition contains a dependence cycle.
  bool hasDepCycle() const { return DepCycle; }

  /// \brief Adds an instruction to this partition.
  void add(Instruction *I) { Set.insert(I); }

  /// \brief Collection accessors.
  InstructionSet::iterator begin() { return Set.begin(); }
  InstructionSet::iterator end() { return Set.end(); }
  InstructionSet::const_iterator begin() const { return Set.begin(); }
  InstructionSet::const_iterator end() const { return Set.end(); }
  bool empty() const { return Set.empty(); }

  /// \brief Moves this partition into \p Other. This partition becomes empty
  /// after this.
  void moveTo(InstPartition &Other) {
    Other.Set.insert(Set.begin(), Set.end());
    Set.clear();
    Other.DepCycle |= DepCycle;
  }

  /// \brief Populates the partition with the transitive closure of all the
  /// instructions that the seeded instructions depend on.
  void populateUsedSet() {
    // FIXME: We currently don't use control-dependence but simply include all
    // blocks (possibly empty at the end) and let simplifycfg mostly clean this
    // up.
    for (auto *B : OrigLoop->getBlocks())
      Set.insert(B->getTerminator());

    // Follow the use-def chains to form a transitive closure of all the
    // instructions that the originally seeded instructions depend on.
    SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
    while (!Worklist.empty()) {
      Instruction *I = Worklist.pop_back_val();
      // Insert instructions from the loop that we depend on.
      for (Value *V : I->operand_values()) {
        auto *I = dyn_cast<Instruction>(V);
        if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
          Worklist.push_back(I);
      }
    }
  }

  /// \brief Clones the original loop.
  ///
  /// Updates LoopInfo and DominatorTree using the information that block \p
  /// LoopDomBB dominates the loop.
  Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
                               unsigned Index, LoopInfo *LI,
                               DominatorTree *DT) {
    ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
                                          VMap, Twine(".ldist") + Twine(Index),
                                          LI, DT, ClonedLoopBlocks);
    return ClonedLoop;
  }

  /// \brief The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  const Loop *getClonedLoop() const { return ClonedLoop; }

  /// \brief Returns the loop where this partition ends up after distribution.
  /// If this partition is mapped to the original loop, the original loop is
  /// returned.
  const Loop *getDistributedLoop() const {
    return ClonedLoop ? ClonedLoop : OrigLoop;
  }

  /// \brief The VMap that is populated by cloning and then used in
  /// remapInstructions to remap the cloned instructions.
  ValueToValueMapTy &getVMap() { return VMap; }

  /// \brief Remaps the cloned instructions using VMap.
  void remapInstructions() {
    remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
  }

  /// \brief Based on the set of instructions selected for this partition,
  /// removes the unnecessary ones.
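  ///
  /// For a cloned partition the unused instructions are first translated
  /// through VMap so that the copies in the cloned loop, not the originals,
  /// are erased.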
  void removeUnusedInsts() {
    SmallVector<Instruction *, 8> Unused;

    for (auto *Block : OrigLoop->getBlocks())
      for (auto &Inst : *Block)
        if (!Set.count(&Inst)) {
          Instruction *NewInst = &Inst;
          if (!VMap.empty())
            NewInst = cast<Instruction>(VMap[NewInst]);

          assert(!isa<BranchInst>(NewInst) &&
                 "Branches are marked used early on");
          Unused.push_back(NewInst);
        }

    // Delete the instructions backwards, as it has a reduced likelihood of
    // having to update as many def-use and use-def chains.
    for (auto *Inst : reverse(Unused)) {
      if (!Inst->use_empty())
        Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
      Inst->eraseFromParent();
    }
  }

  void print() const {
    if (DepCycle)
      dbgs() << "  (cycle)\n";
    for (auto *I : Set)
      // Prefix with the block name.
      dbgs() << "  " << I->getParent()->getName() << ":" << *I << "\n";
  }

  void printBlocks() const {
    for (auto *BB : getDistributedLoop()->getBlocks())
      dbgs() << *BB;
  }

private:
  /// \brief Instructions from OrigLoop selected for this partition.
  InstructionSet Set;

  /// \brief Whether this partition contains a dependence cycle.
  bool DepCycle;

  /// \brief The original loop.
  Loop *OrigLoop;

  /// \brief The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  Loop *ClonedLoop;

  /// \brief The blocks of ClonedLoop including the preheader. If this
  /// partition is mapped to the original loop, this is empty.
  SmallVector<BasicBlock *, 8> ClonedLoopBlocks;

  /// \brief This gets populated once the set of instructions has been
  /// finalized. If this partition is mapped to the original loop, it is not
  /// set.
  ValueToValueMapTy VMap;
};

/// \brief Holds the set of Partitions. It populates them, merges them and then
/// clones the loops.
class InstPartitionContainer {
  typedef DenseMap<Instruction *, int> InstToPartitionIdT;

public:
  InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
      : L(L), LI(LI), DT(DT) {}

  /// \brief Returns the number of partitions.
  unsigned getSize() const { return PartitionContainer.size(); }

  /// \brief Adds \p Inst into the current partition if that is marked to
  /// contain cycles. Otherwise starts a new partition for it.
  void addToCyclicPartition(Instruction *Inst) {
    // If the current partition is non-cyclic, start a new one.
    if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
      PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
    else
      PartitionContainer.back().add(Inst);
  }

  /// \brief Adds \p Inst into a partition that is not marked to contain
  /// dependence cycles.
  ///
  // Initially we isolate memory instructions into as many partitions as
  // possible, then later we may merge them back together.
  void addToNewNonCyclicPartition(Instruction *Inst) {
    PartitionContainer.emplace_back(Inst, L);
  }

  /// \brief Merges adjacent non-cyclic partitions.
  ///
  /// The idea is that we currently only want to isolate the non-vectorizable
  /// partition. We could later allow more distribution among these partitions
  /// too.
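  ///
  /// For example (illustrative only), a partition sequence
  /// {NC1, NC2, Cyclic, NC3, NC4} becomes {NC1+NC2, Cyclic, NC3+NC4}: each
  /// maximal run of adjacent non-cyclic partitions is folded into its first
  /// member.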
  void mergeAdjacentNonCyclic() {
    mergeAdjacentPartitionsIf(
        [](const InstPartition *P) { return !P->hasDepCycle(); });
  }

  /// \brief If a partition contains only conditional stores, we won't
  /// vectorize it. Try to merge it with a previous cyclic partition.
  void mergeNonIfConvertible() {
    mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
      if (Partition->hasDepCycle())
        return true;

      // Now, check if all stores are conditional in this partition.
      bool seenStore = false;

      for (auto *Inst : *Partition)
        if (isa<StoreInst>(Inst)) {
          seenStore = true;
          if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
            return false;
        }
      return seenStore;
    });
  }

  /// \brief Merges the partitions according to various heuristics.
  void mergeBeforePopulating() {
    mergeAdjacentNonCyclic();
    if (!DistributeNonIfConvertible)
      mergeNonIfConvertible();
  }

  /// \brief Merges partitions in order to ensure that no loads are duplicated.
  ///
  /// We can't duplicate loads because that could potentially reorder them.
  /// LoopAccessAnalysis provides dependency information with the context that
  /// the order of memory operations is preserved.
  ///
  /// Returns true if any partitions were merged.
  bool mergeToAvoidDuplicatedLoads() {
    typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT;
    typedef EquivalenceClasses<InstPartition *> ToBeMergedT;

    LoadToPartitionT LoadToPartition;
    ToBeMergedT ToBeMerged;

    // Step through the partitions and create equivalence between partitions
    // that contain the same load. Also put partitions in between them in the
    // same equivalence class to avoid reordering of memory operations.
    for (PartitionContainerT::iterator I = PartitionContainer.begin(),
                                       E = PartitionContainer.end();
         I != E; ++I) {
      auto *PartI = &*I;

      // If a load occurs in two partitions PartI and PartJ, merge all
      // partitions (PartI, PartJ] into PartI.
      for (Instruction *Inst : *PartI)
        if (isa<LoadInst>(Inst)) {
          bool NewElt;
          LoadToPartitionT::iterator LoadToPart;

          std::tie(LoadToPart, NewElt) =
              LoadToPartition.insert(std::make_pair(Inst, PartI));
          if (!NewElt) {
            DEBUG(dbgs() << "Merging partitions due to this load in multiple "
                         << "partitions: " << PartI << ", "
                         << LoadToPart->second << "\n" << *Inst << "\n");

            auto PartJ = I;
            do {
              --PartJ;
              ToBeMerged.unionSets(PartI, &*PartJ);
            } while (&*PartJ != LoadToPart->second);
          }
        }
    }
    if (ToBeMerged.empty())
      return false;

    // Merge the members of an equivalence class into its class leader. This
    // makes the members empty.
    for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
         I != E; ++I) {
      if (!I->isLeader())
        continue;

      auto PartI = I->getData();
      for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
                                   ToBeMerged.member_end())) {
        PartJ->moveTo(*PartI);
      }
    }

    // Remove the empty partitions.
    PartitionContainer.remove_if(
        [](const InstPartition &P) { return P.empty(); });

    return true;
  }

  /// \brief Sets up the mapping from instructions to partitions. If the
  /// instruction is duplicated across multiple partitions, set the entry
  /// to -1.
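  ///
  /// The resulting ids are consumed by computePartitionSetForPointers below to
  /// decide which run-time pointer checks span more than one partition.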
  void setupPartitionIdOnInstructions() {
    int PartitionID = 0;
    for (const auto &Partition : PartitionContainer) {
      for (Instruction *Inst : Partition) {
        bool NewElt;
        InstToPartitionIdT::iterator Iter;

        std::tie(Iter, NewElt) =
            InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
        if (!NewElt)
          Iter->second = -1;
      }
      ++PartitionID;
    }
  }

  /// \brief Populates the partition with everything that the seeding
  /// instructions require.
  void populateUsedSet() {
    for (auto &P : PartitionContainer)
      P.populateUsedSet();
  }

  /// \brief This performs the main chunk of the work of cloning the loops for
  /// the partitions.
  void cloneLoops() {
    BasicBlock *OrigPH = L->getLoopPreheader();
    // At this point the predecessor of the preheader is either the memcheck
    // block or the top part of the original preheader.
    BasicBlock *Pred = OrigPH->getSinglePredecessor();
    assert(Pred && "Preheader does not have a single predecessor");
    BasicBlock *ExitBlock = L->getExitBlock();
    assert(ExitBlock && "No single exit block");
    Loop *NewLoop;

    assert(!PartitionContainer.empty() && "at least two partitions expected");
    // We're cloning the preheader along with the loop so we already made sure
    // it was empty.
    assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
           "preheader not empty");

    // Create a loop for each partition except the last. Clone the original
    // loop before PH along with adding a preheader for the cloned loop. Then
    // update PH to point to the newly added preheader.
    BasicBlock *TopPH = OrigPH;
    unsigned Index = getSize() - 1;
    for (auto I = std::next(PartitionContainer.rbegin()),
              E = PartitionContainer.rend();
         I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
      auto *Part = &*I;

      NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);

      Part->getVMap()[ExitBlock] = TopPH;
      Part->remapInstructions();
    }
    Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);

    // Now go in forward order and update the immediate dominator for the
    // preheaders with the exiting block of the previous loop. Dominance
    // within the loop is updated in cloneLoopWithPreheader.
    for (auto Curr = PartitionContainer.cbegin(),
              Next = std::next(PartitionContainer.cbegin()),
              E = PartitionContainer.cend();
         Next != E; ++Curr, ++Next)
      DT->changeImmediateDominator(
          Next->getDistributedLoop()->getLoopPreheader(),
          Curr->getDistributedLoop()->getExitingBlock());
  }

  /// \brief Removes the dead instructions from the cloned loops.
  void removeUnusedInsts() {
    for (auto &Partition : PartitionContainer)
      Partition.removeUnusedInsts();
  }

  /// \brief For each memory pointer, computes the partition id that the
  /// pointer is used in.
  ///
  /// This returns an array of int where the I-th entry corresponds to the
  /// I-th entry in LAI.getRuntimePointerChecking(). If the pointer is used in
  /// multiple partitions its entry is set to -1.
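  ///
  /// (Internally a value of -2 marks an entry whose partition has not been
  /// determined yet; it must never survive to the returned array, which the
  /// assert below enforces.)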
  SmallVector<int, 8>
  computePartitionSetForPointers(const LoopAccessInfo &LAI) {
    const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();

    unsigned N = RtPtrCheck->Pointers.size();
    SmallVector<int, 8> PtrToPartitions(N);
    for (unsigned I = 0; I < N; ++I) {
      Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
      auto Instructions =
          LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);

      int &Partition = PtrToPartitions[I];
      // First set it to uninitialized.
      Partition = -2;
      for (Instruction *Inst : Instructions) {
        // Note that this could be -1 if Inst is duplicated across multiple
        // partitions.
        int ThisPartition = this->InstToPartitionId[Inst];
        if (Partition == -2)
          Partition = ThisPartition;
        // -1 means belonging to multiple partitions.
        else if (Partition == -1)
          break;
        else if (Partition != (int)ThisPartition)
          Partition = -1;
      }
      assert(Partition != -2 && "Pointer not belonging to any partition");
    }

    return PtrToPartitions;
  }

  void print(raw_ostream &OS) const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      OS << "Partition " << Index++ << " (" << &P << "):\n";
      P.print();
    }
  }

  void dump() const { print(dbgs()); }

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const InstPartitionContainer &Partitions) {
    Partitions.print(OS);
    return OS;
  }
#endif

  void printBlocks() const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
      P.printBlocks();
    }
  }

private:
  typedef std::list<InstPartition> PartitionContainerT;

  /// \brief List of partitions.
  PartitionContainerT PartitionContainer;

  /// \brief Mapping from Instruction to partition Id. If the instruction
  /// belongs to multiple partitions the entry contains -1.
  InstToPartitionIdT InstToPartitionId;

  Loop *L;
  LoopInfo *LI;
  DominatorTree *DT;

  /// \brief The control structure to merge adjacent partitions if both satisfy
  /// the \p Predicate.
  template <class UnaryPredicate>
  void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
    InstPartition *PrevMatch = nullptr;
    for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
      auto DoesMatch = Predicate(&*I);
      if (PrevMatch == nullptr && DoesMatch) {
        PrevMatch = &*I;
        ++I;
      } else if (PrevMatch != nullptr && DoesMatch) {
        I->moveTo(*PrevMatch);
        I = PartitionContainer.erase(I);
      } else {
        PrevMatch = nullptr;
        ++I;
      }
    }
  }
};

/// \brief For each memory instruction, this class maintains the difference
/// between the number of unsafe dependences that start out from this
/// instruction and the number that end here.
///
/// By traversing the memory instructions in program order and accumulating
/// this number, we know whether any unsafe dependence crosses over a program
/// point.
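///
/// For example, a backward dependence from instruction A to a later
/// instruction B contributes +1 at A and -1 at B, so every program point
/// strictly between A and B sees a positive running sum and is known to be
/// crossed by the dependence.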
class MemoryInstructionDependences {
  typedef MemoryDepChecker::Dependence Dependence;

public:
  struct Entry {
    Instruction *Inst;
    unsigned NumUnsafeDependencesStartOrEnd;

    Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {}
  };

  typedef SmallVector<Entry, 8> AccessesType;

  AccessesType::const_iterator begin() const { return Accesses.begin(); }
  AccessesType::const_iterator end() const { return Accesses.end(); }

  MemoryInstructionDependences(
      const SmallVectorImpl<Instruction *> &Instructions,
      const SmallVectorImpl<Dependence> &Dependences) {
    Accesses.append(Instructions.begin(), Instructions.end());

    DEBUG(dbgs() << "Backward dependences:\n");
    for (auto &Dep : Dependences)
      if (Dep.isPossiblyBackward()) {
        // Note that the designations source and destination follow the program
        // order, i.e. source is always first. (The direction is given by the
        // DepType.)
        ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
        --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;

        DEBUG(Dep.print(dbgs(), 2, Instructions));
      }
  }

private:
  AccessesType Accesses;
};

/// \brief The actual class performing the per-loop work.
class LoopDistributeForLoop {
public:
  LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
                        ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
      : L(L), F(F), LI(LI), LAI(nullptr), DT(DT), SE(SE), ORE(ORE) {
    setForced();
  }

  /// \brief Try to distribute an inner-most loop.
  bool processLoop(LoopAccessLegacyAnalysis *LAA) {
    assert(L->empty() && "Only process inner loops.");

    DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
                 << "\" checking " << *L << "\n");

    BasicBlock *PH = L->getLoopPreheader();
    if (!PH)
      return fail("no preheader");
    if (!L->getExitBlock())
      return fail("multiple exit blocks");

    // LAA will check that we only have a single exiting block.
    LAI = &LAA->getInfo(L);

    // Currently, we only distribute to isolate the part of the loop with
    // dependence cycles to enable partial vectorization.
    if (LAI->canVectorizeMemory())
      return fail("memory operations are safe for vectorization");

    auto *Dependences = LAI->getDepChecker().getDependences();
    if (!Dependences || Dependences->empty())
      return fail("no unsafe dependences to isolate");

    InstPartitionContainer Partitions(L, LI, DT);

    // First, go through each memory operation and assign them to consecutive
    // partitions (the order of partitions follows program order). Put those
    // with unsafe dependences into a "cyclic" partition, otherwise put each
    // store in its own "non-cyclic" partition (we'll merge these later).
    //
    // Note that a memory operation (e.g. Load2 below) at a program point that
    // has an unsafe dependence (Store3->Load1) spanning over it must be
    // included in the same cyclic partition as the dependent operations. This
    // is to preserve the original program order after distribution. E.g.:
    //
    //           NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
    //  Load1   -.             1                           0->1
    //  Load2    | /Unsafe/    0                           1
    //  Store3  -'            -1                           1->0
    //  Load4                  0                           0
    //
    // NumUnsafeDependencesActive > 0 indicates this situation and in this case
    // we just keep assigning to the same cyclic partition until
    // NumUnsafeDependencesActive reaches 0.
    const MemoryDepChecker &DepChecker = LAI->getDepChecker();
    MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
                                     *Dependences);

    int NumUnsafeDependencesActive = 0;
    for (auto &InstDep : MID) {
      Instruction *I = InstDep.Inst;
      // We update NumUnsafeDependencesActive post-instruction, so catch the
      // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
      if (NumUnsafeDependencesActive ||
          InstDep.NumUnsafeDependencesStartOrEnd > 0)
        Partitions.addToCyclicPartition(I);
      else
        Partitions.addToNewNonCyclicPartition(I);
      NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
      assert(NumUnsafeDependencesActive >= 0 &&
             "Negative number of dependences active");
    }

    // Add partitions for values used outside. These partitions can be out of
    // order from the original program order. This is OK because if the
    // partition uses a load we will merge this partition with the original
    // partition of the load that we set up in the previous loop (see
    // mergeToAvoidDuplicatedLoads).
    auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
    for (auto *Inst : DefsUsedOutside)
      Partitions.addToNewNonCyclicPartition(Inst);

    DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("cannot isolate unsafe dependencies");

    // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
    // should be able to vectorize these together.
    Partitions.mergeBeforePopulating();
    DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("cannot isolate unsafe dependencies");

    // Now, populate the partitions with non-memory operations.
    Partitions.populateUsedSet();
    DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);

    // In order to preserve original lexical order for loads, keep them in the
    // partition that we set up in the MemoryInstructionDependences loop.
    if (Partitions.mergeToAvoidDuplicatedLoads()) {
      DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
                   << Partitions);
      if (Partitions.getSize() < 2)
        return fail("cannot isolate unsafe dependencies");
    }

    // Don't distribute the loop if we need too many SCEV run-time checks.
    const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
    if (Pred.getComplexity() > (IsForced.getValueOr(false)
                                    ? PragmaDistributeSCEVCheckThreshold
                                    : DistributeSCEVCheckThreshold))
      return fail("too many SCEV run-time checks needed");

    DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
    // We're done forming the partitions; set up the reverse mapping from
    // instructions to partitions.
    Partitions.setupPartitionIdOnInstructions();

    // To keep things simple, have an empty preheader before we version or
    // clone the loop. (Also split if this has no predecessor, i.e. entry,
    // because we rely on PH having a predecessor.)
    if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
      SplitBlock(PH, PH->getTerminator(), DT, LI);

    // If we need run-time checks, version the loop now.
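    // Checks between pointers that end up in the same partition would be
    // redundant after distribution, so only the cross-partition subset is
    // kept. If any memory check or SCEV predicate remains, LoopVersioning
    // emits the checks together with an unmodified fall-back copy of the loop
    // that is taken when they fail at run time.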
    auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
    const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
    const auto &AllChecks = RtPtrChecking->getChecks();
    auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
                                                  RtPtrChecking);

    if (!Pred.isAlwaysTrue() || !Checks.empty()) {
      DEBUG(dbgs() << "\nPointers:\n");
      DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
      LoopVersioning LVer(*LAI, L, LI, DT, SE, false);
      LVer.setAliasChecks(std::move(Checks));
      LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate());
      LVer.versionLoop(DefsUsedOutside);
      LVer.annotateLoopWithNoAlias();
    }

    // Create identical copies of the original loop for each partition and hook
    // them up sequentially.
    Partitions.cloneLoops();

    // Now, we remove the instructions from each loop that don't belong to that
    // partition.
    Partitions.removeUnusedInsts();
    DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
    DEBUG(Partitions.printBlocks());

    if (LDistVerify) {
      LI->verify();
      DT->verifyDomTree();
    }

    ++NumLoopsDistributed;
    // Report the success.
    emitOptimizationRemark(F->getContext(), LDIST_NAME, *F, L->getStartLoc(),
                           "distributed loop");
    return true;
  }

  /// \brief Provide diagnostics, then \return false.
  bool fail(llvm::StringRef Message) {
    LLVMContext &Ctx = F->getContext();
    bool Forced = isForced().getValueOr(false);

    DEBUG(dbgs() << "Skipping; " << Message << "\n");

    // With Rpass-missed report that distribution failed.
    ORE->emitOptimizationRemarkMissed(
        LDIST_NAME, L,
        "loop not distributed: use -Rpass-analysis=loop-distribute for more "
        "info");

    // With Rpass-analysis report why. This is on by default if distribution
    // was requested explicitly.
    emitOptimizationRemarkAnalysis(
        Ctx, Forced ? DiagnosticInfoOptimizationRemarkAnalysis::AlwaysPrint
                    : LDIST_NAME,
        *F, L->getStartLoc(), Twine("loop not distributed: ") + Message);

    // Also issue a warning if distribution was requested explicitly but it
    // failed.
    if (Forced)
      Ctx.diagnose(DiagnosticInfoOptimizationFailure(
          *F, L->getStartLoc(), "loop not distributed: failed "
                                "explicitly specified loop distribution"));

    return false;
  }

  /// \brief Returns whether distribution is forced to be enabled/disabled for
  /// the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false). If the optional has no value
  /// distribution was not forced either way.
  const Optional<bool> &isForced() const { return IsForced; }

private:
  /// \brief Filter out checks between pointers from the same partition.
  ///
  /// \p PtrToPartition contains the partition number for pointers. Partition
  /// number -1 means that the pointer is used in multiple partitions. In this
  /// case we can't safely omit the check.
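  ///
  /// For example (illustrative values), with PtrToPartition = {0, 0, 1} a
  /// check between pointers 0 and 1 is dropped because both land in partition
  /// 0, while a check between pointers 1 and 2 is kept.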
  SmallVector<RuntimePointerChecking::PointerCheck, 4>
  includeOnlyCrossPartitionChecks(
      const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks,
      const SmallVectorImpl<int> &PtrToPartition,
      const RuntimePointerChecking *RtPtrChecking) {
    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;

    std::copy_if(AllChecks.begin(), AllChecks.end(),
                 std::back_inserter(Checks),
                 [&](const RuntimePointerChecking::PointerCheck &Check) {
                   for (unsigned PtrIdx1 : Check.first->Members)
                     for (unsigned PtrIdx2 : Check.second->Members)
                       // Only include this check if there is a pair of
                       // pointers that require checking and the pointers
                       // fall into separate partitions.
                       //
                       // (Note that we already know at this point that the
                       // two pointer groups need checking but it doesn't
                       // follow that each pair of pointers within the two
                       // groups needs checking as well.
                       //
                       // In other words we don't want to include a check just
                       // because there is a pair of pointers between the two
                       // pointer groups that require checks and a different
                       // pair whose pointers fall into different partitions.)
                       if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
                           !RuntimePointerChecking::arePointersInSamePartition(
                               PtrToPartition, PtrIdx1, PtrIdx2))
                         return true;
                   return false;
                 });

    return Checks;
  }

  /// \brief Check whether the loop metadata is forcing distribution to be
  /// enabled/disabled.
  void setForced() {
    Optional<const MDOperand *> Value =
        findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
    if (!Value)
      return;

    const MDOperand *Op = *Value;
    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
    IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
  }

  Loop *L;
  Function *F;

  // Analyses used.
  LoopInfo *LI;
  const LoopAccessInfo *LAI;
  DominatorTree *DT;
  ScalarEvolution *SE;
  OptimizationRemarkEmitter *ORE;

  /// \brief Indicates whether distribution is forced to be enabled/disabled
  /// for the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false). If the optional has no value
  /// distribution was not forced either way.
  Optional<bool> IsForced;
};

/// \brief The pass class.
class LoopDistribute : public FunctionPass {
public:
  /// \p ProcessAllLoopsByDefault specifies whether loop distribution should be
  /// performed by default. Pass -enable-loop-distribute={0,1} overrides this
  /// default. We use this to keep LoopDistribution off by default when invoked
  /// from the optimization pipeline but on when invoked explicitly from opt.
  LoopDistribute(bool ProcessAllLoopsByDefault = true)
      : FunctionPass(ID), ProcessAllLoops(ProcessAllLoopsByDefault) {
    // The default is set by the caller.
    if (EnableLoopDistribute.getNumOccurrences() > 0)
      ProcessAllLoops = EnableLoopDistribute;
    initializeLoopDistributePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();

    // Build up a worklist of inner-most loops to distribute. This is necessary
    // as the act of distributing a loop creates new loops and can invalidate
    // iterators across the loops.
    SmallVector<Loop *, 8> Worklist;

    for (Loop *TopLevelLoop : *LI)
      for (Loop *L : depth_first(TopLevelLoop))
        // We only handle inner-most loops.
        if (L->empty())
          Worklist.push_back(L);

    // Now walk the identified inner loops.
    bool Changed = false;
    for (Loop *L : Worklist) {
      LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE);

      // If distribution was forced for the specific loop to be
      // enabled/disabled, follow that. Otherwise use the global flag.
      if (LDL.isForced().getValueOr(ProcessAllLoops))
        Changed |= LDL.processLoop(LAA);
    }

    return Changed;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
  }

  static char ID;

private:
  /// \brief Whether distribution should be on in this function. The per-loop
  /// pragma can override this.
  bool ProcessAllLoops;
};
} // anonymous namespace

char LoopDistribute::ID;
static const char ldist_name[] = "Loop Distribution";

INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false)

namespace llvm {
FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault) {
  return new LoopDistribute(ProcessAllLoopsByDefault);
}
}