//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the Loop Distribution Pass. Its main focus is to
// distribute loops that cannot be vectorized due to dependence cycles. It
// tries to isolate the offending dependences into a new loop allowing
// vectorization of the remaining parts.
//
// For dependence analysis, the pass uses the LoopVectorizer's
// LoopAccessAnalysis. Because this analysis presumes no change in the order of
// memory operations, special care is taken to preserve the lexical order of
// these operations.
//
// Similarly to the Vectorizer, the pass also supports loop versioning to
// run-time disambiguate potentially overlapping arrays.
//
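// For example (an illustrative sketch, not taken from an actual test case),
// given a loop where S1 carries a dependence cycle and S2 does not:
//
//   for (i = 0; i < n; i++) {
//     A[i + 1] = A[i] * B[i];   // S1: not vectorizable
//     C[i] = D[i] * E[i];       // S2: vectorizable
//   }
//
// the pass distributes it into two loops so that S2 can be vectorized:
//
//   for (i = 0; i < n; i++)
//     A[i + 1] = A[i] * B[i];
//   for (i = 0; i < n; i++)
//     C[i] = D[i] * E[i];
//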
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopDistribute.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <cassert>
#include <functional>
#include <list>
#include <tuple>
#include <utility>

using namespace llvm;

#define LDIST_NAME "loop-distribute"
#define DEBUG_TYPE LDIST_NAME

static cl::opt<bool>
    LDistVerify("loop-distribute-verify", cl::Hidden,
                cl::desc("Turn on DominatorTree and LoopInfo verification "
                         "after Loop Distribution"),
                cl::init(false));

static cl::opt<bool> DistributeNonIfConvertible(
    "loop-distribute-non-if-convertible", cl::Hidden,
    cl::desc("Whether to distribute into a loop that may not be "
             "if-convertible by the loop vectorizer"),
    cl::init(false));

static cl::opt<unsigned> DistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution"));

static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
    cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution for loops marked with "
             "#pragma loop distribute(enable)"));

static cl::opt<bool> EnableLoopDistribute(
    "enable-loop-distribute", cl::Hidden,
    cl::desc("Enable the new, experimental LoopDistribution Pass"),
    cl::init(false));

STATISTIC(NumLoopsDistributed, "Number of loops distributed");

namespace {

/// \brief Maintains the set of instructions of the loop for a partition before
/// cloning. After cloning, it hosts the new loop.
class InstPartition {
  using InstructionSet = SmallPtrSet<Instruction *, 8>;

public:
  InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
      : DepCycle(DepCycle), OrigLoop(L) {
    Set.insert(I);
  }

  /// \brief Returns whether this partition contains a dependence cycle.
  bool hasDepCycle() const { return DepCycle; }

  /// \brief Adds an instruction to this partition.
  void add(Instruction *I) { Set.insert(I); }

  /// \brief Collection accessors.
  InstructionSet::iterator begin() { return Set.begin(); }
  InstructionSet::iterator end() { return Set.end(); }
  InstructionSet::const_iterator begin() const { return Set.begin(); }
  InstructionSet::const_iterator end() const { return Set.end(); }
  bool empty() const { return Set.empty(); }

  /// \brief Moves this partition into \p Other. This partition becomes empty
  /// after this.
  void moveTo(InstPartition &Other) {
    Other.Set.insert(Set.begin(), Set.end());
    Set.clear();
    Other.DepCycle |= DepCycle;
  }

  /// \brief Populates the partition with the transitive closure of all the
  /// instructions that the seeded instructions depend on.
  void populateUsedSet() {
    // FIXME: We currently don't use control-dependence but simply include all
    // blocks (possibly empty at the end) and let simplifycfg mostly clean this
    // up.
    for (auto *B : OrigLoop->getBlocks())
      Set.insert(B->getTerminator());

    // Follow the use-def chains to form a transitive closure of all the
    // instructions that the originally seeded instructions depend on.
    SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
    while (!Worklist.empty()) {
      Instruction *I = Worklist.pop_back_val();
      // Insert instructions from the loop that we depend on.
      for (Value *V : I->operand_values()) {
        auto *I = dyn_cast<Instruction>(V);
        if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
          Worklist.push_back(I);
      }
    }
  }

  /// \brief Clones the original loop.
  ///
  /// Updates LoopInfo and DominatorTree using the information that block \p
  /// LoopDomBB dominates the loop.
  Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
                               unsigned Index, LoopInfo *LI,
                               DominatorTree *DT) {
    ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
                                          VMap, Twine(".ldist") + Twine(Index),
                                          LI, DT, ClonedLoopBlocks);
    return ClonedLoop;
  }

  /// \brief The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  const Loop *getClonedLoop() const { return ClonedLoop; }

  /// \brief Returns the loop where this partition ends up after distribution.
  /// If this partition is mapped to the original loop, that loop is returned.
  const Loop *getDistributedLoop() const {
    return ClonedLoop ? ClonedLoop : OrigLoop;
  }

  /// \brief The VMap that is populated by cloning and then used in
  /// remapInstructionsInBlocks to remap the cloned instructions.
  ValueToValueMapTy &getVMap() { return VMap; }

  /// \brief Remaps the cloned instructions using VMap.
  void remapInstructions() {
    remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
  }

  /// \brief Based on the set of instructions selected for this partition,
  /// removes the unnecessary ones.
  void removeUnusedInsts() {
    SmallVector<Instruction *, 8> Unused;

    for (auto *Block : OrigLoop->getBlocks())
      for (auto &Inst : *Block)
        if (!Set.count(&Inst)) {
          Instruction *NewInst = &Inst;
          if (!VMap.empty())
            NewInst = cast<Instruction>(VMap[NewInst]);

          assert(!isa<BranchInst>(NewInst) &&
                 "Branches are marked used early on");
          Unused.push_back(NewInst);
        }

    // Delete the instructions backwards, as it has a reduced likelihood of
    // having to update as many def-use and use-def chains.
    for (auto *Inst : reverse(Unused)) {
      if (!Inst->use_empty())
        Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
      Inst->eraseFromParent();
    }
  }

  void print() const {
    if (DepCycle)
      dbgs() << "  (cycle)\n";
    for (auto *I : Set)
      // Prefix with the block name.
      dbgs() << "  " << I->getParent()->getName() << ":" << *I << "\n";
  }

  void printBlocks() const {
    for (auto *BB : getDistributedLoop()->getBlocks())
      dbgs() << *BB;
  }

private:
  /// \brief Instructions from OrigLoop selected for this partition.
  InstructionSet Set;

  /// \brief Whether this partition contains a dependence cycle.
  bool DepCycle;

  /// \brief The original loop.
  Loop *OrigLoop;

  /// \brief The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  Loop *ClonedLoop = nullptr;

  /// \brief The blocks of ClonedLoop including the preheader. If this
  /// partition is mapped to the original loop, this is empty.
  SmallVector<BasicBlock *, 8> ClonedLoopBlocks;

  /// \brief This gets populated once the set of instructions has been
  /// finalized. If this partition is mapped to the original loop, it is not
  /// set.
  ValueToValueMapTy VMap;
};

/// \brief Holds the set of Partitions. It populates them, merges them and then
/// clones the loops.
class InstPartitionContainer {
  using InstToPartitionIdT = DenseMap<Instruction *, int>;

public:
  InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
      : L(L), LI(LI), DT(DT) {}

  /// \brief Returns the number of partitions.
  unsigned getSize() const { return PartitionContainer.size(); }

  /// \brief Adds \p Inst into the current partition if that is marked to
  /// contain cycles. Otherwise starts a new partition for it.
  void addToCyclicPartition(Instruction *Inst) {
    // If the current partition is non-cyclic, start a new one.
    if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
      PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
    else
      PartitionContainer.back().add(Inst);
  }

  /// \brief Adds \p Inst into a partition that is not marked to contain
  /// dependence cycles.
  ///
  // Initially we isolate memory instructions into as many partitions as
  // possible, then later we may merge them back together.
  void addToNewNonCyclicPartition(Instruction *Inst) {
    PartitionContainer.emplace_back(Inst, L);
  }

  /// \brief Merges adjacent non-cyclic partitions.
  ///
  /// The idea is that we currently only want to isolate the non-vectorizable
  /// partition. We could later allow more distribution among these partitions
  /// too.
  void mergeAdjacentNonCyclic() {
    mergeAdjacentPartitionsIf(
        [](const InstPartition *P) { return !P->hasDepCycle(); });
  }

  /// \brief If a partition contains only conditional stores, we won't
  /// vectorize it. Try to merge it with a previous cyclic partition.
  void mergeNonIfConvertible() {
    mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
      if (Partition->hasDepCycle())
        return true;

      // Now, check if all stores are conditional in this partition.
      bool SeenStore = false;

      for (auto *Inst : *Partition)
        if (isa<StoreInst>(Inst)) {
          SeenStore = true;
          if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
            return false;
        }
      return SeenStore;
    });
  }

  /// \brief Merges the partitions according to various heuristics.
  void mergeBeforePopulating() {
    mergeAdjacentNonCyclic();
    if (!DistributeNonIfConvertible)
      mergeNonIfConvertible();
  }

  /// \brief Merges partitions in order to ensure that no loads are duplicated.
  ///
  /// We can't duplicate loads because that could potentially reorder them.
  /// LoopAccessAnalysis provides dependency information with the context that
  /// the order of memory operations is preserved.
  ///
  /// Returns true if any partitions were merged.
  bool mergeToAvoidDuplicatedLoads() {
    using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>;
    using ToBeMergedT = EquivalenceClasses<InstPartition *>;

    LoadToPartitionT LoadToPartition;
    ToBeMergedT ToBeMerged;

    // Step through the partitions and create equivalence classes between
    // partitions that contain the same load. Also put partitions in between
    // them in the same equivalence class to avoid reordering of memory
    // operations.
    for (PartitionContainerT::iterator I = PartitionContainer.begin(),
                                       E = PartitionContainer.end();
         I != E; ++I) {
      auto *PartI = &*I;

      // If a load occurs in two partitions PartI and PartJ, merge all
      // partitions (PartI, PartJ] into PartI.
      for (Instruction *Inst : *PartI)
        if (isa<LoadInst>(Inst)) {
          bool NewElt;
          LoadToPartitionT::iterator LoadToPart;

          std::tie(LoadToPart, NewElt) =
              LoadToPartition.insert(std::make_pair(Inst, PartI));
          if (!NewElt) {
            DEBUG(dbgs() << "Merging partitions due to this load in multiple "
                         << "partitions: " << PartI << ", "
                         << LoadToPart->second << "\n" << *Inst << "\n");

            auto PartJ = I;
            do {
              --PartJ;
              ToBeMerged.unionSets(PartI, &*PartJ);
            } while (&*PartJ != LoadToPart->second);
          }
        }
    }
    if (ToBeMerged.empty())
      return false;

    // Merge the members of an equivalence class into its class leader. This
    // makes the members empty.
    for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
         I != E; ++I) {
      if (!I->isLeader())
        continue;

      auto PartI = I->getData();
      for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
                                   ToBeMerged.member_end())) {
        PartJ->moveTo(*PartI);
      }
    }

    // Remove the empty partitions.
    PartitionContainer.remove_if(
        [](const InstPartition &P) { return P.empty(); });

    return true;
  }

  /// \brief Sets up the mapping from instructions to partitions. If an
  /// instruction is duplicated across multiple partitions, sets the entry
  /// to -1.
  void setupPartitionIdOnInstructions() {
    int PartitionID = 0;
    for (const auto &Partition : PartitionContainer) {
      for (Instruction *Inst : Partition) {
        bool NewElt;
        InstToPartitionIdT::iterator Iter;

        std::tie(Iter, NewElt) =
            InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
        if (!NewElt)
          Iter->second = -1;
      }
      ++PartitionID;
    }
  }

  /// \brief Populates the partitions with everything that the seeding
  /// instructions require.
  void populateUsedSet() {
    for (auto &P : PartitionContainer)
      P.populateUsedSet();
  }

  /// \brief This performs the main chunk of the work of cloning the loops for
  /// the partitions.
  void cloneLoops() {
    BasicBlock *OrigPH = L->getLoopPreheader();
    // At this point the predecessor of the preheader is either the memcheck
    // block or the top part of the original preheader.
    BasicBlock *Pred = OrigPH->getSinglePredecessor();
    assert(Pred && "Preheader does not have a single predecessor");
    BasicBlock *ExitBlock = L->getExitBlock();
    assert(ExitBlock && "No single exit block");
    Loop *NewLoop;

    assert(getSize() > 1 && "at least two partitions expected");
    // We're cloning the preheader along with the loop so we already made sure
    // it was empty.
    assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
           "preheader not empty");

    // Create a loop for each partition except the last. Clone the original
    // loop before PH along with adding a preheader for the cloned loop. Then
    // update PH to point to the newly added preheader.
    BasicBlock *TopPH = OrigPH;
    unsigned Index = getSize() - 1;
    for (auto I = std::next(PartitionContainer.rbegin()),
              E = PartitionContainer.rend();
         I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
      auto *Part = &*I;

      NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);

      Part->getVMap()[ExitBlock] = TopPH;
      Part->remapInstructions();
    }
    Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);

    // Now go in forward order and update the immediate dominator for the
    // preheaders with the exiting block of the previous loop. Dominance
    // within the loop is updated in cloneLoopWithPreheader.
    for (auto Curr = PartitionContainer.cbegin(),
              Next = std::next(PartitionContainer.cbegin()),
              E = PartitionContainer.cend();
         Next != E; ++Curr, ++Next)
      DT->changeImmediateDominator(
          Next->getDistributedLoop()->getLoopPreheader(),
          Curr->getDistributedLoop()->getExitingBlock());
  }

  /// \brief Removes the dead instructions from the cloned loops.
  void removeUnusedInsts() {
    for (auto &Partition : PartitionContainer)
      Partition.removeUnusedInsts();
  }

  /// \brief For each memory pointer, computes the partition Id that the
  /// pointer is used in.
  ///
  /// This returns an array of int where the I-th entry corresponds to the I-th
  /// entry in LAI.getRuntimePointerChecking(). If the pointer is used in
  /// multiple partitions its entry is set to -1.
  SmallVector<int, 8>
  computePartitionSetForPointers(const LoopAccessInfo &LAI) {
    const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();

    unsigned N = RtPtrCheck->Pointers.size();
    SmallVector<int, 8> PtrToPartitions(N);
    for (unsigned I = 0; I < N; ++I) {
      Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
      auto Instructions =
          LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);

      int &Partition = PtrToPartitions[I];
      // First set it to uninitialized.
      Partition = -2;
      for (Instruction *Inst : Instructions) {
        // Note that this could be -1 if Inst is duplicated across multiple
        // partitions.
        int ThisPartition = this->InstToPartitionId[Inst];
        if (Partition == -2)
          Partition = ThisPartition;
        // -1 means belonging to multiple partitions.
        else if (Partition == -1)
          break;
        else if (Partition != ThisPartition)
          Partition = -1;
      }
      assert(Partition != -2 && "Pointer not belonging to any partition");
    }

    return PtrToPartitions;
  }

  void print(raw_ostream &OS) const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      OS << "Partition " << Index++ << " (" << &P << "):\n";
      P.print();
    }
  }

  void dump() const { print(dbgs()); }

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const InstPartitionContainer &Partitions) {
    Partitions.print(OS);
    return OS;
  }
#endif

  void printBlocks() const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
      P.printBlocks();
    }
  }

private:
  using PartitionContainerT = std::list<InstPartition>;

  /// \brief List of partitions.
  PartitionContainerT PartitionContainer;

  /// \brief Mapping from Instruction to partition Id. If the instruction
  /// belongs to multiple partitions the entry contains -1.
  InstToPartitionIdT InstToPartitionId;

  Loop *L;
  LoopInfo *LI;
  DominatorTree *DT;

  /// \brief The control structure to merge adjacent partitions if both satisfy
  /// the \p Predicate.
  template <class UnaryPredicate>
  void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
    InstPartition *PrevMatch = nullptr;
    for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
      auto DoesMatch = Predicate(&*I);
      if (PrevMatch == nullptr && DoesMatch) {
        PrevMatch = &*I;
        ++I;
      } else if (PrevMatch != nullptr && DoesMatch) {
        I->moveTo(*PrevMatch);
        I = PartitionContainer.erase(I);
      } else {
        PrevMatch = nullptr;
        ++I;
      }
    }
  }
};

/// \brief For each memory instruction, this class maintains the difference
/// between the number of unsafe dependences that start at this instruction
/// and the number that end here.
///
/// By traversing the memory instructions in program order and accumulating
/// this number, we know whether any unsafe dependence crosses over a given
/// program point.
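///
/// For instance (an illustrative sketch, not from an actual test case), with
/// accesses A, B, C in program order and a single unsafe dependence A -> C,
/// the per-entry deltas are A: +1, B: 0, C: -1. The running sum is 1 at A and
/// B and returns to 0 only after C, showing that the dependence is still
/// active across B, so B must stay in the same partition as A and C.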
class MemoryInstructionDependences {
  using Dependence = MemoryDepChecker::Dependence;

public:
  struct Entry {
    Instruction *Inst;
    // Signed delta: +1 for each unsafe dependence starting here, -1 for each
    // one ending here.
    int NumUnsafeDependencesStartOrEnd = 0;

    Entry(Instruction *Inst) : Inst(Inst) {}
  };

  using AccessesType = SmallVector<Entry, 8>;

  AccessesType::const_iterator begin() const { return Accesses.begin(); }
  AccessesType::const_iterator end() const { return Accesses.end(); }

  MemoryInstructionDependences(
      const SmallVectorImpl<Instruction *> &Instructions,
      const SmallVectorImpl<Dependence> &Dependences) {
    Accesses.append(Instructions.begin(), Instructions.end());

    DEBUG(dbgs() << "Backward dependences:\n");
    for (auto &Dep : Dependences)
      if (Dep.isPossiblyBackward()) {
        // Note that the designations source and destination follow the program
        // order, i.e. source is always first. (The direction is given by the
        // DepType.)
        ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
        --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;

        DEBUG(Dep.print(dbgs(), 2, Instructions));
      }
  }

private:
  AccessesType Accesses;
};

/// \brief The actual class performing the per-loop work.
class LoopDistributeForLoop {
public:
  LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
                        ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
      : L(L), F(F), LI(LI), DT(DT), SE(SE), ORE(ORE) {
    setForced();
  }

  /// \brief Try to distribute an inner-most loop.
  bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
    assert(L->empty() && "Only process inner loops.");

    DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
                 << "\" checking " << *L << "\n");

    if (!L->getExitBlock())
      return fail("MultipleExitBlocks", "multiple exit blocks");
    if (!L->isLoopSimplifyForm())
      return fail("NotLoopSimplifyForm", "loop is not in loop-simplify form");

    BasicBlock *PH = L->getLoopPreheader();

    // LAA will check that we only have a single exiting block.
    LAI = &GetLAA(*L);

    // Currently, we only distribute to isolate the part of the loop with
    // dependence cycles to enable partial vectorization.
    if (LAI->canVectorizeMemory())
      return fail("MemOpsCanBeVectorized",
                  "memory operations are safe for vectorization");

    auto *Dependences = LAI->getDepChecker().getDependences();
    if (!Dependences || Dependences->empty())
      return fail("NoUnsafeDeps", "no unsafe dependences to isolate");

    InstPartitionContainer Partitions(L, LI, DT);

    // First, go through each memory operation and assign it to consecutive
    // partitions (the order of partitions follows program order). Put those
    // with unsafe dependences into a "cyclic" partition, otherwise put each
    // store in its own "non-cyclic" partition (we'll merge these later).
    //
    // Note that a memory operation (e.g. Load2 below) at a program point that
    // has an unsafe dependence (Store3->Load1) spanning over it must be
    // included in the same cyclic partition as the dependent operations. This
    // is to preserve the original program order after distribution. E.g.:
    //
    //                NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
    //  Load1   -.                  1                            0->1
    //  Load2    |  /Unsafe/        0                            1
    //  Store3  -'                 -1                            1->0
    //  Load4                       0                            0
    //
    // NumUnsafeDependencesActive > 0 indicates this situation and in this
    // case we just keep assigning to the same cyclic partition until
    // NumUnsafeDependencesActive reaches 0.
    const MemoryDepChecker &DepChecker = LAI->getDepChecker();
    MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
                                     *Dependences);

    int NumUnsafeDependencesActive = 0;
    for (auto &InstDep : MID) {
      Instruction *I = InstDep.Inst;
      // We update NumUnsafeDependencesActive post-instruction, catch the
      // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
      if (NumUnsafeDependencesActive ||
          InstDep.NumUnsafeDependencesStartOrEnd > 0)
        Partitions.addToCyclicPartition(I);
      else
        Partitions.addToNewNonCyclicPartition(I);
      NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
      assert(NumUnsafeDependencesActive >= 0 &&
             "Negative number of dependences active");
    }

    // Add partitions for values used outside. These partitions can be out of
    // order from the original program order. This is OK because if the
    // partition uses a load we will merge this partition with the original
    // partition of the load that we set up in the previous loop (see
    // mergeToAvoidDuplicatedLoads).
    auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
    for (auto *Inst : DefsUsedOutside)
      Partitions.addToNewNonCyclicPartition(Inst);

    DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("CantIsolateUnsafeDeps",
                  "cannot isolate unsafe dependencies");

    // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
    // should be able to vectorize these together.
    Partitions.mergeBeforePopulating();
    DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("CantIsolateUnsafeDeps",
                  "cannot isolate unsafe dependencies");

    // Now, populate the partitions with non-memory operations.
    Partitions.populateUsedSet();
    DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);

    // In order to preserve original lexical order for loads, keep them in the
    // partition that we set up in the MemoryInstructionDependences loop.
    if (Partitions.mergeToAvoidDuplicatedLoads()) {
      DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
                   << Partitions);
      if (Partitions.getSize() < 2)
        return fail("CantIsolateUnsafeDeps",
                    "cannot isolate unsafe dependencies");
    }

    // Don't distribute the loop if we need too many SCEV run-time checks.
    const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
    if (Pred.getComplexity() > (IsForced.getValueOr(false)
                                    ? PragmaDistributeSCEVCheckThreshold
                                    : DistributeSCEVCheckThreshold))
      return fail("TooManySCEVRuntimeChecks",
                  "too many SCEV run-time checks needed");

    DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
    // We're done forming the partitions; set up the reverse mapping from
    // instructions to partitions.
    Partitions.setupPartitionIdOnInstructions();

    // To keep things simple have an empty preheader before we version or
    // clone the loop. (Also split if this has no predecessor, i.e. entry,
    // because we rely on PH having a predecessor.)
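    // E.g. (hypothetical block names), "entry -> ph -> header" becomes
    // "entry -> ph -> ph.split -> header", where the new, empty "ph.split"
    // block is the preheader that versioning and cloning operate on.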
    if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
      SplitBlock(PH, PH->getTerminator(), DT, LI);

    // If we need run-time checks, version the loop now.
    auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
    const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
    const auto &AllChecks = RtPtrChecking->getChecks();
    auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
                                                  RtPtrChecking);

    if (!Pred.isAlwaysTrue() || !Checks.empty()) {
      DEBUG(dbgs() << "\nPointers:\n");
      DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
      LoopVersioning LVer(*LAI, L, LI, DT, SE, /*UseLAIChecks=*/false);
      LVer.setAliasChecks(std::move(Checks));
      LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate());
      LVer.versionLoop(DefsUsedOutside);
      LVer.annotateLoopWithNoAlias();
    }

    // Create identical copies of the original loop for each partition and
    // hook them up sequentially.
    Partitions.cloneLoops();

    // Now, we remove the instructions from each loop that don't belong to
    // that partition.
    Partitions.removeUnusedInsts();
    DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
    DEBUG(Partitions.printBlocks());

    if (LDistVerify) {
      LI->verify(*DT);
      DT->verifyDomTree();
    }

    ++NumLoopsDistributed;
    // Report the success.
    ORE->emit([&]() {
      return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
                                L->getHeader())
             << "distributed loop";
    });
    return true;
  }

  /// \brief Provide diagnostics, then \return false.
  bool fail(StringRef RemarkName, StringRef Message) {
    LLVMContext &Ctx = F->getContext();
    bool Forced = isForced().getValueOr(false);

    DEBUG(dbgs() << "Skipping; " << Message << "\n");

    // With Rpass-missed, report that distribution failed.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed",
                                      L->getStartLoc(), L->getHeader())
             << "loop not distributed: use -Rpass-analysis=loop-distribute "
                "for more info";
    });

    // With Rpass-analysis, report why. This is on by default if distribution
    // was requested explicitly.
    ORE->emit(OptimizationRemarkAnalysis(
                  Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME,
                  RemarkName, L->getStartLoc(), L->getHeader())
              << "loop not distributed: " << Message);

    // Also issue a warning if distribution was requested explicitly but it
    // failed.
    if (Forced)
      Ctx.diagnose(DiagnosticInfoOptimizationFailure(
          *F, L->getStartLoc(),
          "loop not distributed: failed explicitly specified loop "
          "distribution"));

    return false;
  }

  /// \brief Returns whether distribution is forced to be enabled/disabled for
  /// the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false). If the optional has no value
  /// distribution was not forced either way.
  const Optional<bool> &isForced() const { return IsForced; }

private:
  /// \brief Filter out checks between pointers from the same partition.
  ///
  /// \p PtrToPartition contains the partition number for pointers. Partition
  /// number -1 means that the pointer is used in multiple partitions. In this
  /// case we can't safely omit the check.
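  ///
  /// For example (an illustrative sketch), with PtrToPartition = {0, 0, 1},
  /// a check between pointers 0 and 1 can be dropped because they end up in
  /// the same partition, while a check between pointers 0 and 2 must be kept;
  /// a pointer whose entry is -1 conservatively keeps all of its checks.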
  SmallVector<RuntimePointerChecking::PointerCheck, 4>
  includeOnlyCrossPartitionChecks(
      const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks,
      const SmallVectorImpl<int> &PtrToPartition,
      const RuntimePointerChecking *RtPtrChecking) {
    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;

    copy_if(AllChecks, std::back_inserter(Checks),
            [&](const RuntimePointerChecking::PointerCheck &Check) {
              for (unsigned PtrIdx1 : Check.first->Members)
                for (unsigned PtrIdx2 : Check.second->Members)
                  // Only include this check if there is a pair of pointers
                  // that require checking and the pointers fall into
                  // separate partitions.
                  //
                  // (Note that we already know at this point that the two
                  // pointer groups need checking but it doesn't follow
                  // that each pair of pointers within the two groups needs
                  // checking as well.
                  //
                  // In other words we don't want to include a check just
                  // because there is a pair of pointers between the two
                  // pointer groups that require checks and a different
                  // pair whose pointers fall into different partitions.)
                  if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
                      !RuntimePointerChecking::arePointersInSamePartition(
                          PtrToPartition, PtrIdx1, PtrIdx2))
                    return true;
              return false;
            });

    return Checks;
  }

  /// \brief Check whether the loop metadata is forcing distribution to be
  /// enabled/disabled.
  void setForced() {
    Optional<const MDOperand *> Value =
        findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
    if (!Value)
      return;

    const MDOperand *Op = *Value;
    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
    IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
  }

  Loop *L;
  Function *F;

  // Analyses used.
  LoopInfo *LI;
  const LoopAccessInfo *LAI = nullptr;
  DominatorTree *DT;
  ScalarEvolution *SE;
  OptimizationRemarkEmitter *ORE;

  /// \brief Indicates whether distribution is forced to be enabled/disabled
  /// for the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false). If the optional has no value
  /// distribution was not forced either way.
  Optional<bool> IsForced;
};

} // end anonymous namespace

/// Shared implementation between new and old PMs.
static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
                    ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
                    std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
  // Build up a worklist of inner-most loops to distribute. This is necessary
  // as the act of distributing a loop creates new loops and can invalidate
  // iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *TopLevelLoop : *LI)
    for (Loop *L : depth_first(TopLevelLoop))
      // We only handle inner-most loops.
      if (L->empty())
        Worklist.push_back(L);

  // Now walk the identified inner loops.
  bool Changed = false;
  for (Loop *L : Worklist) {
    LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE);

    // If distribution was forced for the specific loop to be
    // enabled/disabled, follow that. Otherwise use the global flag.
    if (LDL.isForced().getValueOr(EnableLoopDistribute))
      Changed |= LDL.processLoop(GetLAA);
  }

  return Changed;
}

namespace {

/// \brief The pass class.
class LoopDistributeLegacy : public FunctionPass {
public:
  static char ID;

  LoopDistributeLegacy() : FunctionPass(ID) {
    initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return runImpl(F, LI, DT, SE, ORE, GetLAA);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
  }
};

} // end anonymous namespace

PreservedAnalyses LoopDistributePass::run(Function &F,
                                          FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  // We don't directly need these analyses but they're required for loop
  // analyses so provide them below.
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };

  bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserve<LoopAnalysis>();
  PA.preserve<DominatorTreeAnalysis>();
  PA.preserve<GlobalsAA>();
  return PA;
}

char LoopDistributeLegacy::ID;

static const char ldist_name[] = "Loop Distribution";

INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false)

FunctionPass *llvm::createLoopDistributePass() {
  return new LoopDistributeLegacy();
}
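// To exercise the pass from the command line (an illustrative invocation
// using the flags and pass name defined above):
//
//   opt -loop-distribute -enable-loop-distribute -S input.ll
//
// Individual loops can also opt in via "#pragma clang loop distribute(enable)"
// in the source, which is encoded as "llvm.loop.distribute.enable" metadata
// and checked by setForced() above.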