//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the Loop Distribution Pass. Its main focus is to
// distribute loops that cannot be vectorized due to dependence cycles. It
// tries to isolate the offending dependences into a new loop allowing
// vectorization of the remaining parts.
//
// For dependence analysis, the pass uses the LoopVectorizer's
// LoopAccessAnalysis. Because this analysis presumes no change in the order of
// memory operations, special care is taken to preserve the lexical order of
// these operations.
//
// Similarly to the Vectorizer, the pass also supports loop versioning to
// run-time disambiguate potentially overlapping arrays.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopDistribute.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <list>

#define LDIST_NAME "loop-distribute"
#define DEBUG_TYPE LDIST_NAME

using namespace llvm;

static cl::opt<bool>
    LDistVerify("loop-distribute-verify", cl::Hidden,
                cl::desc("Turn on DominatorTree and LoopInfo verification "
                         "after Loop Distribution"),
                cl::init(false));

static cl::opt<bool> DistributeNonIfConvertible(
    "loop-distribute-non-if-convertible", cl::Hidden,
    cl::desc("Whether to distribute into a loop that may not be "
             "if-convertible by the loop vectorizer"),
    cl::init(false));

static cl::opt<unsigned> DistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution"));

static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
    cl::Hidden,
    cl::desc(
        "The maximum number of SCEV checks allowed for Loop "
        "Distribution for loop marked with #pragma loop distribute(enable)"));

static cl::opt<bool> EnableLoopDistribute(
    "enable-loop-distribute", cl::Hidden,
    cl::desc("Enable the new, experimental LoopDistribution Pass"),
    cl::init(false));

STATISTIC(NumLoopsDistributed, "Number of loops distributed");

namespace {
/// \brief Maintains the set of instructions of the loop for a partition before
/// cloning. After cloning, it hosts the new loop.
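///
/// The partition goes through several phases: it is seeded with memory
/// instructions, closed over their use-def chains by populateUsedSet(), and
/// then, unless it is mapped to the original loop, cloned, remapped through
/// the VMap, and pruned of the instructions that belong to other partitions.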
class InstPartition {
  typedef SmallPtrSet<Instruction *, 8> InstructionSet;

public:
  InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
      : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) {
    Set.insert(I);
  }

  /// \brief Returns whether this partition contains a dependence cycle.
  bool hasDepCycle() const { return DepCycle; }

  /// \brief Adds an instruction to this partition.
  void add(Instruction *I) { Set.insert(I); }

  /// \brief Collection accessors.
  InstructionSet::iterator begin() { return Set.begin(); }
  InstructionSet::iterator end() { return Set.end(); }
  InstructionSet::const_iterator begin() const { return Set.begin(); }
  InstructionSet::const_iterator end() const { return Set.end(); }
  bool empty() const { return Set.empty(); }

  /// \brief Moves this partition into \p Other. This partition becomes empty
  /// after this.
  void moveTo(InstPartition &Other) {
    Other.Set.insert(Set.begin(), Set.end());
    Set.clear();
    Other.DepCycle |= DepCycle;
  }

  /// \brief Populates the partition with the transitive closure of all the
  /// instructions that the seeded instructions depend on.
  void populateUsedSet() {
    // FIXME: We currently don't use control-dependence but simply include all
    // blocks (possibly empty at the end) and let simplifycfg mostly clean this
    // up.
    for (auto *B : OrigLoop->getBlocks())
      Set.insert(B->getTerminator());

    // Follow the use-def chains to form a transitive closure of all the
    // instructions that the originally seeded instructions depend on.
    SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
    while (!Worklist.empty()) {
      Instruction *I = Worklist.pop_back_val();
      // Insert instructions from the loop that we depend on.
      for (Value *V : I->operand_values()) {
        auto *I = dyn_cast<Instruction>(V);
        if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
          Worklist.push_back(I);
      }
    }
  }

  /// \brief Clones the original loop.
  ///
  /// Updates LoopInfo and DominatorTree using the information that block \p
  /// LoopDomBB dominates the loop.
  Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
                               unsigned Index, LoopInfo *LI,
                               DominatorTree *DT) {
    ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
                                          VMap, Twine(".ldist") + Twine(Index),
                                          LI, DT, ClonedLoopBlocks);
    return ClonedLoop;
  }

  /// \brief The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  const Loop *getClonedLoop() const { return ClonedLoop; }

  /// \brief Returns the loop where this partition ends up after distribution.
  /// If this partition is mapped to the original loop then use the block from
  /// the loop.
  const Loop *getDistributedLoop() const {
    return ClonedLoop ? ClonedLoop : OrigLoop;
  }

  /// \brief The VMap that is populated by cloning and then used in
  /// remapInstructions to remap the cloned instructions.
  ValueToValueMapTy &getVMap() { return VMap; }

  /// \brief Remaps the cloned instructions using VMap.
  void remapInstructions() {
    remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
  }

  /// \brief Based on the set of instructions selected for this partition,
  /// removes the unnecessary ones.
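  ///
  /// For a cloned partition, each unused instruction is first translated
  /// through the VMap to its clone before being erased.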
  void removeUnusedInsts() {
    SmallVector<Instruction *, 8> Unused;

    for (auto *Block : OrigLoop->getBlocks())
      for (auto &Inst : *Block)
        if (!Set.count(&Inst)) {
          Instruction *NewInst = &Inst;
          if (!VMap.empty())
            NewInst = cast<Instruction>(VMap[NewInst]);

          assert(!isa<BranchInst>(NewInst) &&
                 "Branches are marked used early on");
          Unused.push_back(NewInst);
        }

    // Delete the instructions backwards, as this reduces the likelihood of
    // having to update def-use and use-def chains.
    for (auto *Inst : reverse(Unused)) {
      if (!Inst->use_empty())
        Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
      Inst->eraseFromParent();
    }
  }

  void print() const {
    if (DepCycle)
      dbgs() << "  (cycle)\n";
    for (auto *I : Set)
      // Prefix with the block name.
      dbgs() << "  " << I->getParent()->getName() << ":" << *I << "\n";
  }

  void printBlocks() const {
    for (auto *BB : getDistributedLoop()->getBlocks())
      dbgs() << *BB;
  }

private:
  /// \brief Instructions from OrigLoop selected for this partition.
  InstructionSet Set;

  /// \brief Whether this partition contains a dependence cycle.
  bool DepCycle;

  /// \brief The original loop.
  Loop *OrigLoop;

  /// \brief The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  Loop *ClonedLoop;

  /// \brief The blocks of ClonedLoop including the preheader. If this
  /// partition is mapped to the original loop, this is empty.
  SmallVector<BasicBlock *, 8> ClonedLoopBlocks;

  /// \brief This gets populated once the set of instructions has been
  /// finalized. If this partition is mapped to the original loop, it is not
  /// set.
  ValueToValueMapTy VMap;
};

/// \brief Holds the set of Partitions. It populates them, merges them and then
/// clones the loops.
class InstPartitionContainer {
  typedef DenseMap<Instruction *, int> InstToPartitionIdT;

public:
  InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
      : L(L), LI(LI), DT(DT) {}

  /// \brief Returns the number of partitions.
  unsigned getSize() const { return PartitionContainer.size(); }

  /// \brief Adds \p Inst into the current partition if that is marked to
  /// contain cycles. Otherwise start a new partition for it.
  void addToCyclicPartition(Instruction *Inst) {
    // If the current partition is non-cyclic, start a new one.
    if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
      PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
    else
      PartitionContainer.back().add(Inst);
  }

  /// \brief Adds \p Inst into a partition that is not marked to contain
  /// dependence cycles.
  ///
  // Initially we isolate memory instructions into as many partitions as
  // possible, then later we may merge them back together.
  void addToNewNonCyclicPartition(Instruction *Inst) {
    PartitionContainer.emplace_back(Inst, L);
  }

  /// \brief Merges adjacent non-cyclic partitions.
  ///
  /// The idea is that we currently only want to isolate the non-vectorizable
  /// partition. We could later allow more distribution among these partitions
  /// too.
  void mergeAdjacentNonCyclic() {
    mergeAdjacentPartitionsIf(
        [](const InstPartition *P) { return !P->hasDepCycle(); });
  }

  /// \brief If a partition contains only conditional stores, we won't
  /// vectorize it. Try to merge it with a previous cyclic partition.
  void mergeNonIfConvertible() {
    mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
      if (Partition->hasDepCycle())
        return true;

      // Now, check if all stores are conditional in this partition.
      bool seenStore = false;

      for (auto *Inst : *Partition)
        if (isa<StoreInst>(Inst)) {
          seenStore = true;
          if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
            return false;
        }
      return seenStore;
    });
  }

  /// \brief Merges the partitions according to various heuristics.
  void mergeBeforePopulating() {
    mergeAdjacentNonCyclic();
    if (!DistributeNonIfConvertible)
      mergeNonIfConvertible();
  }

  /// \brief Merges partitions in order to ensure that no loads are duplicated.
  ///
  /// We can't duplicate loads because that could potentially reorder them.
  /// LoopAccessAnalysis provides dependency information with the context that
  /// the order of memory operations is preserved.
  ///
  /// Returns whether any partitions were merged.
  bool mergeToAvoidDuplicatedLoads() {
    typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT;
    typedef EquivalenceClasses<InstPartition *> ToBeMergedT;

    LoadToPartitionT LoadToPartition;
    ToBeMergedT ToBeMerged;

    // Step through the partitions and create equivalence classes between
    // partitions that contain the same load. Also put partitions in between
    // them in the same equivalence class to avoid reordering of memory
    // operations.
    for (PartitionContainerT::iterator I = PartitionContainer.begin(),
                                       E = PartitionContainer.end();
         I != E; ++I) {
      auto *PartI = &*I;

      // If a load occurs in two partitions PartI and PartJ, merge all
      // partitions (PartI, PartJ] into PartI.
      for (Instruction *Inst : *PartI)
        if (isa<LoadInst>(Inst)) {
          bool NewElt;
          LoadToPartitionT::iterator LoadToPart;

          std::tie(LoadToPart, NewElt) =
              LoadToPartition.insert(std::make_pair(Inst, PartI));
          if (!NewElt) {
            DEBUG(dbgs() << "Merging partitions due to this load in multiple "
                         << "partitions: " << PartI << ", "
                         << LoadToPart->second << "\n" << *Inst << "\n");

            auto PartJ = I;
            do {
              --PartJ;
              ToBeMerged.unionSets(PartI, &*PartJ);
            } while (&*PartJ != LoadToPart->second);
          }
        }
    }
    if (ToBeMerged.empty())
      return false;

    // Merge the members of an equivalence class into its class leader. This
    // makes the members empty.
    for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
         I != E; ++I) {
      if (!I->isLeader())
        continue;

      auto PartI = I->getData();
      for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
                                   ToBeMerged.member_end())) {
        PartJ->moveTo(*PartI);
      }
    }

    // Remove the empty partitions.
    PartitionContainer.remove_if(
        [](const InstPartition &P) { return P.empty(); });

    return true;
  }

  /// \brief Sets up the mapping from instructions to partitions. If the
  /// instruction is duplicated across multiple partitions, set the entry
  /// to -1.
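  ///
  /// The -1 entries are consumed later by computePartitionSetForPointers() to
  /// recognize pointers whose accesses are not confined to a single partition.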
  void setupPartitionIdOnInstructions() {
    int PartitionID = 0;
    for (const auto &Partition : PartitionContainer) {
      for (Instruction *Inst : Partition) {
        bool NewElt;
        InstToPartitionIdT::iterator Iter;

        std::tie(Iter, NewElt) =
            InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
        if (!NewElt)
          Iter->second = -1;
      }
      ++PartitionID;
    }
  }

  /// \brief Populates the partitions with everything that the seeding
  /// instructions require.
  void populateUsedSet() {
    for (auto &P : PartitionContainer)
      P.populateUsedSet();
  }

  /// \brief This performs the main chunk of the work of cloning the loops for
  /// the partitions.
  void cloneLoops() {
    BasicBlock *OrigPH = L->getLoopPreheader();
    // At this point the predecessor of the preheader is either the memcheck
    // block or the top part of the original preheader.
    BasicBlock *Pred = OrigPH->getSinglePredecessor();
    assert(Pred && "Preheader does not have a single predecessor");
    BasicBlock *ExitBlock = L->getExitBlock();
    assert(ExitBlock && "No single exit block");
    Loop *NewLoop;

    assert(!PartitionContainer.empty() && "at least two partitions expected");
    // We're cloning the preheader along with the loop so we already made sure
    // it was empty.
    assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
           "preheader not empty");

    // Create a loop for each partition except the last. Clone the original
    // loop before PH along with adding a preheader for the cloned loop. Then
    // update PH to point to the newly added preheader.
    BasicBlock *TopPH = OrigPH;
    unsigned Index = getSize() - 1;
    for (auto I = std::next(PartitionContainer.rbegin()),
              E = PartitionContainer.rend();
         I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
      auto *Part = &*I;

      NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);

      Part->getVMap()[ExitBlock] = TopPH;
      Part->remapInstructions();
    }
    Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);

    // Now go in forward order and update the immediate dominator for the
    // preheaders with the exiting block of the previous loop. Dominance
    // within the loop is updated in cloneLoopWithPreheader.
    for (auto Curr = PartitionContainer.cbegin(),
              Next = std::next(PartitionContainer.cbegin()),
              E = PartitionContainer.cend();
         Next != E; ++Curr, ++Next)
      DT->changeImmediateDominator(
          Next->getDistributedLoop()->getLoopPreheader(),
          Curr->getDistributedLoop()->getExitingBlock());
  }

  /// \brief Removes the dead instructions from the cloned loops.
  void removeUnusedInsts() {
    for (auto &Partition : PartitionContainer)
      Partition.removeUnusedInsts();
  }

  /// \brief For each memory pointer, computes the partition Id the pointer is
  /// used in.
  ///
  /// This returns an array of int where the I-th entry corresponds to the
  /// I-th entry in LAI.getRuntimePointerChecking(). If the pointer is used in
  /// multiple partitions its entry is set to -1.
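  ///
  /// (Internally, an entry is first initialized to -2, meaning it has not yet
  /// been assigned to any partition; no entry may remain -2 once all accesses
  /// for its pointer have been visited.)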
  SmallVector<int, 8>
  computePartitionSetForPointers(const LoopAccessInfo &LAI) {
    const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();

    unsigned N = RtPtrCheck->Pointers.size();
    SmallVector<int, 8> PtrToPartitions(N);
    for (unsigned I = 0; I < N; ++I) {
      Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
      auto Instructions =
          LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);

      int &Partition = PtrToPartitions[I];
      // First set it to uninitialized.
      Partition = -2;
      for (Instruction *Inst : Instructions) {
        // Note that this could be -1 if Inst is duplicated across multiple
        // partitions.
        int ThisPartition = this->InstToPartitionId[Inst];
        if (Partition == -2)
          Partition = ThisPartition;
        // -1 means belonging to multiple partitions.
        else if (Partition == -1)
          break;
        else if (Partition != (int)ThisPartition)
          Partition = -1;
      }
      assert(Partition != -2 && "Pointer not belonging to any partition");
    }

    return PtrToPartitions;
  }

  void print(raw_ostream &OS) const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      OS << "Partition " << Index++ << " (" << &P << "):\n";
      P.print();
    }
  }

  void dump() const { print(dbgs()); }

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const InstPartitionContainer &Partitions) {
    Partitions.print(OS);
    return OS;
  }
#endif

  void printBlocks() const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
      P.printBlocks();
    }
  }

private:
  typedef std::list<InstPartition> PartitionContainerT;

  /// \brief List of partitions.
  PartitionContainerT PartitionContainer;

  /// \brief Mapping from Instruction to partition Id. If the instruction
  /// belongs to multiple partitions the entry contains -1.
  InstToPartitionIdT InstToPartitionId;

  Loop *L;
  LoopInfo *LI;
  DominatorTree *DT;

  /// \brief The control structure to merge adjacent partitions if both satisfy
  /// the \p Predicate.
  template <class UnaryPredicate>
  void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
    InstPartition *PrevMatch = nullptr;
    for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
      auto DoesMatch = Predicate(&*I);
      if (PrevMatch == nullptr && DoesMatch) {
        PrevMatch = &*I;
        ++I;
      } else if (PrevMatch != nullptr && DoesMatch) {
        I->moveTo(*PrevMatch);
        I = PartitionContainer.erase(I);
      } else {
        PrevMatch = nullptr;
        ++I;
      }
    }
  }
};

/// \brief For each memory instruction, this class maintains the difference
/// between the number of unsafe dependences that start at this instruction
/// and the number that end here.
///
/// By traversing the memory instructions in program order and accumulating
/// this number, we know whether any unsafe dependence crosses over a program
/// point.
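///
/// For example, given a backward dependence between Load1 and a later Store3
/// (with an unrelated Load2 in between), Load1's entry is +1 and Store3's is
/// -1. Accumulated in program order, the running count is positive exactly at
/// the program points the dependence spans, Load2 included.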
class MemoryInstructionDependences {
  typedef MemoryDepChecker::Dependence Dependence;

public:
  struct Entry {
    Instruction *Inst;
    unsigned NumUnsafeDependencesStartOrEnd;

    Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {}
  };

  typedef SmallVector<Entry, 8> AccessesType;

  AccessesType::const_iterator begin() const { return Accesses.begin(); }
  AccessesType::const_iterator end() const { return Accesses.end(); }

  MemoryInstructionDependences(
      const SmallVectorImpl<Instruction *> &Instructions,
      const SmallVectorImpl<Dependence> &Dependences) {
    Accesses.append(Instructions.begin(), Instructions.end());

    DEBUG(dbgs() << "Backward dependences:\n");
    for (auto &Dep : Dependences)
      if (Dep.isPossiblyBackward()) {
        // Note that the designations source and destination follow the program
        // order, i.e. source is always first. (The direction is given by the
        // DepType.)
        ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
        --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;

        DEBUG(Dep.print(dbgs(), 2, Instructions));
      }
  }

private:
  AccessesType Accesses;
};

/// \brief The actual class performing the per-loop work.
class LoopDistributeForLoop {
public:
  LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
                        ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
      : L(L), F(F), LI(LI), LAI(nullptr), DT(DT), SE(SE), ORE(ORE) {
    setForced();
  }

  /// \brief Try to distribute an inner-most loop.
  bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
    assert(L->empty() && "Only process inner loops.");

    DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
                 << "\" checking " << *L << "\n");

    if (!L->getExitBlock())
      return fail("MultipleExitBlocks", "multiple exit blocks");
    if (!L->isLoopSimplifyForm())
      return fail("NotLoopSimplifyForm", "loop is not in loop-simplify form");

    BasicBlock *PH = L->getLoopPreheader();

    // LAA will check that we only have a single exiting block.
    LAI = &GetLAA(*L);

    // Currently, we only distribute to isolate the part of the loop with
    // dependence cycles to enable partial vectorization.
    if (LAI->canVectorizeMemory())
      return fail("MemOpsCanBeVectorized",
                  "memory operations are safe for vectorization");

    auto *Dependences = LAI->getDepChecker().getDependences();
    if (!Dependences || Dependences->empty())
      return fail("NoUnsafeDeps", "no unsafe dependences to isolate");

    InstPartitionContainer Partitions(L, LI, DT);

    // First, go through the memory operations and assign them to consecutive
    // partitions (the order of partitions follows program order). Put those
    // with unsafe dependences into a "cyclic" partition; otherwise put each
    // store in its own "non-cyclic" partition (we'll merge these later).
    //
    // Note that a memory operation (e.g. Load2 below) at a program point that
    // has an unsafe dependence (Store3->Load1) spanning over it must be
    // included in the same cyclic partition as the dependent operations. This
    // is to preserve the original program order after distribution. E.g.:
    //
    //            NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
    //  Load1   -.                  1                       0->1
    //  Load2    | /Unsafe/         0                       1
    //  Store3  -'                 -1                       1->0
    //  Load4                       0                       0
    //
    // NumUnsafeDependencesActive > 0 indicates this situation and in this case
    // we just keep assigning to the same cyclic partition until
    // NumUnsafeDependencesActive reaches 0.
    const MemoryDepChecker &DepChecker = LAI->getDepChecker();
    MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
                                     *Dependences);

    int NumUnsafeDependencesActive = 0;
    for (auto &InstDep : MID) {
      Instruction *I = InstDep.Inst;
      // We update NumUnsafeDependencesActive post-instruction, catch the
      // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
      if (NumUnsafeDependencesActive ||
          InstDep.NumUnsafeDependencesStartOrEnd > 0)
        Partitions.addToCyclicPartition(I);
      else
        Partitions.addToNewNonCyclicPartition(I);
      NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
      assert(NumUnsafeDependencesActive >= 0 &&
             "Negative number of dependences active");
    }

    // Add partitions for values used outside. These partitions can be out of
    // order from the original program order. This is OK because if the
    // partition uses a load we will merge this partition with the original
    // partition of the load that we set up in the previous loop (see
    // mergeToAvoidDuplicatedLoads).
    auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
    for (auto *Inst : DefsUsedOutside)
      Partitions.addToNewNonCyclicPartition(Inst);

    DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("CantIsolateUnsafeDeps",
                  "cannot isolate unsafe dependencies");

    // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
    // should be able to vectorize these together.
    Partitions.mergeBeforePopulating();
    DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("CantIsolateUnsafeDeps",
                  "cannot isolate unsafe dependencies");

    // Now, populate the partitions with non-memory operations.
    Partitions.populateUsedSet();
    DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);

    // In order to preserve original lexical order for loads, keep them in the
    // partition that we set up in the MemoryInstructionDependences loop.
    if (Partitions.mergeToAvoidDuplicatedLoads()) {
      DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
                   << Partitions);
      if (Partitions.getSize() < 2)
        return fail("CantIsolateUnsafeDeps",
                    "cannot isolate unsafe dependencies");
    }

    // Don't distribute the loop if we need too many SCEV run-time checks.
    const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
    if (Pred.getComplexity() > (IsForced.getValueOr(false)
                                    ? PragmaDistributeSCEVCheckThreshold
                                    : DistributeSCEVCheckThreshold))
      return fail("TooManySCEVRuntimeChecks",
                  "too many SCEV run-time checks needed.\n");

    DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
    // We're done forming the partitions; set up the reverse mapping from
    // instructions to partitions.
    Partitions.setupPartitionIdOnInstructions();

    // To keep things simple have an empty preheader before we version or clone
    // the loop. (Also split if this has no predecessor, i.e. entry, because we
    // rely on PH having a predecessor.)
    if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
      SplitBlock(PH, PH->getTerminator(), DT, LI);

    // If we need run-time checks, version the loop now.
    auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
    const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
    const auto &AllChecks = RtPtrChecking->getChecks();
    auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
                                                  RtPtrChecking);

    if (!Pred.isAlwaysTrue() || !Checks.empty()) {
      DEBUG(dbgs() << "\nPointers:\n");
      DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
      LoopVersioning LVer(*LAI, L, LI, DT, SE, false);
      LVer.setAliasChecks(std::move(Checks));
      LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate());
      LVer.versionLoop(DefsUsedOutside);
      LVer.annotateLoopWithNoAlias();
    }

    // Create identical copies of the original loop for each partition and hook
    // them up sequentially.
    Partitions.cloneLoops();

    // Now, remove the instructions from each loop that don't belong to that
    // partition.
    Partitions.removeUnusedInsts();
    DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
    DEBUG(Partitions.printBlocks());

    if (LDistVerify) {
      LI->verify(*DT);
      DT->verifyDomTree();
    }

    ++NumLoopsDistributed;
    // Report the success.
    ORE->emit([&]() {
      return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
                                L->getHeader())
             << "distributed loop";
    });
    return true;
  }

  /// \brief Provide diagnostics and return false.
  bool fail(StringRef RemarkName, StringRef Message) {
    LLVMContext &Ctx = F->getContext();
    bool Forced = isForced().getValueOr(false);

    DEBUG(dbgs() << "Skipping; " << Message << "\n");

    // With Rpass-missed report that distribution failed.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed",
                                      L->getStartLoc(), L->getHeader())
             << "loop not distributed: use -Rpass-analysis=loop-distribute "
                "for more info";
    });

    // With Rpass-analysis report why. This is on by default if distribution
    // was requested explicitly.
    ORE->emit(OptimizationRemarkAnalysis(
                  Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME,
                  RemarkName, L->getStartLoc(), L->getHeader())
              << "loop not distributed: " << Message);

    // Also issue a warning if distribution was requested explicitly but it
    // failed.
    if (Forced)
      Ctx.diagnose(DiagnosticInfoOptimizationFailure(
          *F, L->getStartLoc(), "loop not distributed: failed "
                                "explicitly specified loop distribution"));

    return false;
  }

  /// \brief Returns whether distribution was forced to be enabled/disabled for
  /// the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false). If the optional has no value
  /// distribution was not forced either way.
  const Optional<bool> &isForced() const { return IsForced; }

private:
  /// \brief Filter out checks between pointers from the same partition.
  ///
  /// \p PtrToPartition contains the partition number for pointers. Partition
  /// number -1 means that the pointer is used in multiple partitions. In this
  /// case we can't safely omit the check.
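  ///
  /// Checks whose pointers all fall into a single partition add no value for
  /// distribution itself: the relative order of accesses within one partition
  /// is preserved, so distribution introduces no new aliasing hazard there.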
  SmallVector<RuntimePointerChecking::PointerCheck, 4>
  includeOnlyCrossPartitionChecks(
      const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks,
      const SmallVectorImpl<int> &PtrToPartition,
      const RuntimePointerChecking *RtPtrChecking) {
    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;

    copy_if(AllChecks, std::back_inserter(Checks),
            [&](const RuntimePointerChecking::PointerCheck &Check) {
              for (unsigned PtrIdx1 : Check.first->Members)
                for (unsigned PtrIdx2 : Check.second->Members)
                  // Only include this check if there is a pair of pointers
                  // that require checking and the pointers fall into
                  // separate partitions.
                  //
                  // (Note that we already know at this point that the two
                  // pointer groups need checking but it doesn't follow
                  // that each pair of pointers within the two groups needs
                  // checking as well.
                  //
                  // In other words we don't want to include a check just
                  // because there is a pair of pointers between the two
                  // pointer groups that require checks and a different
                  // pair whose pointers fall into different partitions.)
                  if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
                      !RuntimePointerChecking::arePointersInSamePartition(
                          PtrToPartition, PtrIdx1, PtrIdx2))
                    return true;
              return false;
            });

    return Checks;
  }

  /// \brief Check whether the loop metadata is forcing distribution to be
  /// enabled/disabled.
  void setForced() {
    Optional<const MDOperand *> Value =
        findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
    if (!Value)
      return;

    const MDOperand *Op = *Value;
    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
    IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
  }

  Loop *L;
  Function *F;

  // Analyses used.
  LoopInfo *LI;
  const LoopAccessInfo *LAI;
  DominatorTree *DT;
  ScalarEvolution *SE;
  OptimizationRemarkEmitter *ORE;

  /// \brief Indicates whether distribution is forced to be enabled/disabled
  /// for the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false). If the optional has no value
  /// distribution was not forced either way.
  Optional<bool> IsForced;
};

/// Shared implementation between new and old PMs.
static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
                    ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
                    std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
  // Build up a worklist of inner-loops to distribute. This is necessary as the
  // act of distributing a loop creates new loops and can invalidate iterators
  // across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *TopLevelLoop : *LI)
    for (Loop *L : depth_first(TopLevelLoop))
      // We only handle inner-most loops.
      if (L->empty())
        Worklist.push_back(L);

  // Now walk the identified inner loops.
  bool Changed = false;
  for (Loop *L : Worklist) {
    LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE);

    // If distribution was forced for the specific loop to be
    // enabled/disabled, follow that. Otherwise use the global flag.
    if (LDL.isForced().getValueOr(EnableLoopDistribute))
      Changed |= LDL.processLoop(GetLAA);
  }

  return Changed;
}

/// \brief The pass class.
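///
/// This adapts the legacy pass manager's analyses to runImpl(); the new pass
/// manager's entry point is LoopDistributePass::run() below.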
class LoopDistributeLegacy : public FunctionPass {
public:
  LoopDistributeLegacy() : FunctionPass(ID) {
    // The default is set by the caller.
    initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return runImpl(F, LI, DT, SE, ORE, GetLAA);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
  }

  static char ID;
};
} // anonymous namespace

PreservedAnalyses LoopDistributePass::run(Function &F,
                                          FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  // We don't directly need these analyses but they're required for loop
  // analyses so provide them below.
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };

  bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserve<LoopAnalysis>();
  PA.preserve<DominatorTreeAnalysis>();
  PA.preserve<GlobalsAA>();
  return PA;
}

char LoopDistributeLegacy::ID;
static const char ldist_name[] = "Loop Distribution";

INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false)

namespace llvm {
FunctionPass *createLoopDistributePass() { return new LoopDistributeLegacy(); }
} // namespace llvm