//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Loop Distribution Pass. Its main focus is to
// distribute loops that cannot be vectorized due to dependence cycles. It
// tries to isolate the offending dependences into a new loop allowing
// vectorization of the remaining parts.
//
// For dependence analysis, the pass uses the LoopVectorizer's
// LoopAccessAnalysis. Because this analysis presumes no change in the order of
// memory operations, special care is taken to preserve the lexical order of
// these operations.
//
// Similarly to the Vectorizer, the pass also supports loop versioning to
// run-time disambiguate potentially overlapping arrays.
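//
// As an illustrative example (hypothetical input, not from this file),
// distribution rewrites
//
//   for (i = 0; i < n; i++) {
//     a[i] = a[i - 1] + c[i];  // dependence cycle: not vectorizable
//     d[i] = b[i] * 2;         // independent: vectorizable
//   }
//
// into a sequential loop that keeps the cycle, followed by a vectorizable
// loop:
//
//   for (i = 0; i < n; i++)
//     a[i] = a[i - 1] + c[i];
//   for (i = 0; i < n; i++)
//     d[i] = b[i] * 2;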
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopDistribute.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <cassert>
#include <functional>
#include <list>
#include <optional>
#include <tuple>
#include <utility>

using namespace llvm;

#define LDIST_NAME "loop-distribute"
#define DEBUG_TYPE LDIST_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopDistributeFollowupAll =
    "llvm.loop.distribute.followup_all";
static const char *const LLVMLoopDistributeFollowupCoincident =
    "llvm.loop.distribute.followup_coincident";
static const char *const LLVMLoopDistributeFollowupSequential =
    "llvm.loop.distribute.followup_sequential";
static const char *const LLVMLoopDistributeFollowupFallback =
    "llvm.loop.distribute.followup_fallback";
/// @}

static cl::opt<bool>
    LDistVerify("loop-distribute-verify", cl::Hidden,
                cl::desc("Turn on DominatorTree and LoopInfo verification "
                         "after Loop Distribution"),
                cl::init(false));

static cl::opt<bool> DistributeNonIfConvertible(
    "loop-distribute-non-if-convertible", cl::Hidden,
    cl::desc("Whether to distribute into a loop that may not be "
             "if-convertible by the loop vectorizer"),
    cl::init(false));

static cl::opt<unsigned> DistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution"));

static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
    cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution for loops marked with "
             "#pragma clang loop distribute(enable)"));

static cl::opt<bool> EnableLoopDistribute(
    "enable-loop-distribute", cl::Hidden,
    cl::desc("Enable the new, experimental LoopDistribution Pass"),
    cl::init(false));

STATISTIC(NumLoopsDistributed, "Number of loops distributed");

namespace {

/// Maintains the set of instructions of the loop for a partition before
/// cloning. After cloning, it hosts the new loop.
class InstPartition {
  using InstructionSet = SmallPtrSet<Instruction *, 8>;

public:
  InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
      : DepCycle(DepCycle), OrigLoop(L) {
    Set.insert(I);
  }

  /// Returns whether this partition contains a dependence cycle.
  bool hasDepCycle() const { return DepCycle; }

  /// Adds an instruction to this partition.
  void add(Instruction *I) { Set.insert(I); }

  /// Collection accessors.
  InstructionSet::iterator begin() { return Set.begin(); }
  InstructionSet::iterator end() { return Set.end(); }
  InstructionSet::const_iterator begin() const { return Set.begin(); }
  InstructionSet::const_iterator end() const { return Set.end(); }
  bool empty() const { return Set.empty(); }

  /// Moves this partition into \p Other. This partition becomes empty
  /// after this.
  void moveTo(InstPartition &Other) {
    Other.Set.insert(Set.begin(), Set.end());
    Set.clear();
    Other.DepCycle |= DepCycle;
  }

  /// Populates the partition with a transitive closure of all the
  /// instructions that the seeded instructions depend on.
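  ///
  /// For example (illustrative), seeding with a store pulls in its stored
  /// value and address computation along with the induction-variable updates
  /// they depend on; the terminators inserted below additionally keep the
  /// loop's control flow intact.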
  void populateUsedSet() {
    // FIXME: We currently don't use control-dependence but simply include all
    // blocks (possibly empty at the end) and let simplifycfg mostly clean this
    // up.
    for (auto *B : OrigLoop->getBlocks())
      Set.insert(B->getTerminator());

    // Follow the use-def chains to form a transitive closure of all the
    // instructions that the originally seeded instructions depend on.
    SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
    while (!Worklist.empty()) {
      Instruction *I = Worklist.pop_back_val();
      // Insert instructions from the loop that we depend on.
      for (Value *V : I->operand_values()) {
        auto *I = dyn_cast<Instruction>(V);
        if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
          Worklist.push_back(I);
      }
    }
  }

  /// Clones the original loop.
  ///
  /// Updates LoopInfo and DominatorTree using the information that block \p
  /// LoopDomBB dominates the loop.
  Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
                               unsigned Index, LoopInfo *LI,
                               DominatorTree *DT) {
    ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
                                          VMap, Twine(".ldist") + Twine(Index),
                                          LI, DT, ClonedLoopBlocks);
    return ClonedLoop;
  }

  /// The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  const Loop *getClonedLoop() const { return ClonedLoop; }

  /// Returns the loop where this partition ends up after distribution.
  /// If this partition is mapped to the original loop then use the block from
  /// the loop.
  Loop *getDistributedLoop() const {
    return ClonedLoop ? ClonedLoop : OrigLoop;
  }

  /// The VMap that is populated by cloning and then used in
  /// remapInstructions to remap the cloned instructions.
  ValueToValueMapTy &getVMap() { return VMap; }

  /// Remaps the cloned instructions using VMap.
  void remapInstructions() {
    remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
  }

  /// Based on the set of instructions selected for this partition,
  /// removes the unnecessary ones.
  void removeUnusedInsts() {
    SmallVector<Instruction *, 8> Unused;

    for (auto *Block : OrigLoop->getBlocks())
      for (auto &Inst : *Block)
        if (!Set.count(&Inst)) {
          Instruction *NewInst = &Inst;
          if (!VMap.empty())
            NewInst = cast<Instruction>(VMap[NewInst]);

          assert(!isa<BranchInst>(NewInst) &&
                 "Branches are marked used early on");
          Unused.push_back(NewInst);
        }

    // Delete the instructions backwards, as it has a reduced likelihood of
    // having to update as many def-use and use-def chains.
    for (auto *Inst : reverse(Unused)) {
      if (!Inst->use_empty())
        Inst->replaceAllUsesWith(PoisonValue::get(Inst->getType()));
      Inst->eraseFromParent();
    }
  }

  void print() const {
    if (DepCycle)
      dbgs() << " (cycle)\n";
    for (auto *I : Set)
      // Prefix with the block name.
      dbgs() << "  " << I->getParent()->getName() << ":" << *I << "\n";
  }

  void printBlocks() const {
    for (auto *BB : getDistributedLoop()->getBlocks())
      dbgs() << *BB;
  }

private:
  /// Instructions from OrigLoop selected for this partition.
  InstructionSet Set;

  /// Whether this partition contains a dependence cycle.
  bool DepCycle;

  /// The original loop.
  Loop *OrigLoop;

  /// The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  Loop *ClonedLoop = nullptr;

  /// The blocks of ClonedLoop including the preheader. If this
  /// partition is mapped to the original loop, this is empty.
  SmallVector<BasicBlock *, 8> ClonedLoopBlocks;

  /// This gets populated once the set of instructions has been
  /// finalized. If this partition is mapped to the original loop, it is not
  /// set.
  ValueToValueMapTy VMap;
};

/// Holds the set of Partitions. It populates them, merges them and then
/// clones the loops.
class InstPartitionContainer {
  using InstToPartitionIdT = DenseMap<Instruction *, int>;

public:
  InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
      : L(L), LI(LI), DT(DT) {}

  /// Returns the number of partitions.
  unsigned getSize() const { return PartitionContainer.size(); }

  /// Adds \p Inst into the current partition if that is marked to
  /// contain cycles. Otherwise start a new partition for it.
  void addToCyclicPartition(Instruction *Inst) {
    // If the current partition is non-cyclic, start a new one.
    if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
      PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
    else
      PartitionContainer.back().add(Inst);
  }

  /// Adds \p Inst into a partition that is not marked to contain
  /// dependence cycles.
  ///
  // Initially we isolate memory instructions into as many partitions as
  // possible, then later we may merge them back together.
  void addToNewNonCyclicPartition(Instruction *Inst) {
    PartitionContainer.emplace_back(Inst, L);
  }

  /// Merges adjacent non-cyclic partitions.
  ///
  /// The idea is that we currently only want to isolate the non-vectorizable
  /// partition. We could later allow more distribution among these partitions
  /// too.
  void mergeAdjacentNonCyclic() {
    mergeAdjacentPartitionsIf(
        [](const InstPartition *P) { return !P->hasDepCycle(); });
  }

  /// If a partition contains only conditional stores, we won't vectorize
  /// it. Try to merge it with a previous cyclic partition.
  void mergeNonIfConvertible() {
    mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
      if (Partition->hasDepCycle())
        return true;

      // Now, check if all stores are conditional in this partition.
      bool SeenStore = false;

      for (auto *Inst : *Partition)
        if (isa<StoreInst>(Inst)) {
          SeenStore = true;
          if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
            return false;
        }
      return SeenStore;
    });
  }

  /// Merges the partitions according to various heuristics.
  void mergeBeforePopulating() {
    mergeAdjacentNonCyclic();
    if (!DistributeNonIfConvertible)
      mergeNonIfConvertible();
  }

  /// Merges partitions in order to ensure that no loads are duplicated.
  ///
  /// We can't duplicate loads because that could potentially reorder them.
  /// LoopAccessAnalysis provides dependency information with the context that
  /// the order of memory operations is preserved.
  ///
  /// Returns true if any partitions were merged.
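  ///
  /// For example (illustrative), if a load appears in both partitions P1 and
  /// P3, the intervening partition P2 is merged in as well; merging only P1
  /// and P3 would move P2's memory operations relative to the load.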
  bool mergeToAvoidDuplicatedLoads() {
    using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>;
    using ToBeMergedT = EquivalenceClasses<InstPartition *>;

    LoadToPartitionT LoadToPartition;
    ToBeMergedT ToBeMerged;

    // Step through the partitions and create equivalence between partitions
    // that contain the same load. Also put partitions in between them in the
    // same equivalence class to avoid reordering of memory operations.
    for (PartitionContainerT::iterator I = PartitionContainer.begin(),
                                       E = PartitionContainer.end();
         I != E; ++I) {
      auto *PartI = &*I;

      // If a load occurs in two partitions PartI and PartJ, merge all
      // partitions (PartI, PartJ] into PartI.
      for (Instruction *Inst : *PartI)
        if (isa<LoadInst>(Inst)) {
          bool NewElt;
          LoadToPartitionT::iterator LoadToPart;

          std::tie(LoadToPart, NewElt) =
              LoadToPartition.insert(std::make_pair(Inst, PartI));
          if (!NewElt) {
            LLVM_DEBUG(dbgs()
                       << "Merging partitions due to this load in multiple "
                       << "partitions: " << PartI << ", " << LoadToPart->second
                       << "\n"
                       << *Inst << "\n");

            auto PartJ = I;
            do {
              --PartJ;
              ToBeMerged.unionSets(PartI, &*PartJ);
            } while (&*PartJ != LoadToPart->second);
          }
        }
    }
    if (ToBeMerged.empty())
      return false;

    // Merge the members of an equivalence class into its class leader. This
    // makes the members empty.
    for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
         I != E; ++I) {
      if (!I->isLeader())
        continue;

      auto PartI = I->getData();
      for (auto *PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
                                    ToBeMerged.member_end())) {
        PartJ->moveTo(*PartI);
      }
    }

    // Remove the empty partitions.
    PartitionContainer.remove_if(
        [](const InstPartition &P) { return P.empty(); });

    return true;
  }

  /// Sets up the mapping between instructions and partitions. If the
  /// instruction is duplicated across multiple partitions, set the entry to
  /// -1.
  void setupPartitionIdOnInstructions() {
    int PartitionID = 0;
    for (const auto &Partition : PartitionContainer) {
      for (Instruction *Inst : Partition) {
        bool NewElt;
        InstToPartitionIdT::iterator Iter;

        std::tie(Iter, NewElt) =
            InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
        if (!NewElt)
          Iter->second = -1;
      }
      ++PartitionID;
    }
  }

  /// Populates the partitions with everything that the seeding
  /// instructions require.
  void populateUsedSet() {
    for (auto &P : PartitionContainer)
      P.populateUsedSet();
  }

  /// This performs the main chunk of the work of cloning the loops for
  /// the partitions.
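  ///
  /// The distributed loops end up chained back-to-back; e.g. with two
  /// partitions the layout is (illustrative, ".ldist1" being the suffix
  /// cloneLoopWithPreheader appends):
  ///
  ///   Pred -> PH.ldist1 -> Loop.ldist1 -> PH -> OrigLoop -> ExitBlock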
  void cloneLoops() {
    BasicBlock *OrigPH = L->getLoopPreheader();
    // At this point the predecessor of the preheader is either the memcheck
    // block or the top part of the original preheader.
    BasicBlock *Pred = OrigPH->getSinglePredecessor();
    assert(Pred && "Preheader does not have a single predecessor");
    BasicBlock *ExitBlock = L->getExitBlock();
    assert(ExitBlock && "No single exit block");
    Loop *NewLoop;

    assert(!PartitionContainer.empty() && "at least two partitions expected");
    // We're cloning the preheader along with the loop so we already made sure
    // it was empty.
    assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
           "preheader not empty");

    // Preserve the original loop ID for use after the transformation.
    MDNode *OrigLoopID = L->getLoopID();

    // Create a loop for each partition except the last. Clone the original
    // loop before PH along with adding a preheader for the cloned loop. Then
    // update PH to point to the newly added preheader.
    BasicBlock *TopPH = OrigPH;
    unsigned Index = getSize() - 1;
    for (auto &Part : llvm::drop_begin(llvm::reverse(PartitionContainer))) {
      NewLoop = Part.cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);

      Part.getVMap()[ExitBlock] = TopPH;
      Part.remapInstructions();
      setNewLoopID(OrigLoopID, &Part);
      --Index;
      TopPH = NewLoop->getLoopPreheader();
    }
    Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);

    // Also set a new loop ID for the last loop.
    setNewLoopID(OrigLoopID, &PartitionContainer.back());

    // Now go in forward order and update the immediate dominator for the
    // preheaders with the exiting block of the previous loop. Dominance
    // within the loop is updated in cloneLoopWithPreheader.
    for (auto Curr = PartitionContainer.cbegin(),
              Next = std::next(PartitionContainer.cbegin()),
              E = PartitionContainer.cend();
         Next != E; ++Curr, ++Next)
      DT->changeImmediateDominator(
          Next->getDistributedLoop()->getLoopPreheader(),
          Curr->getDistributedLoop()->getExitingBlock());
  }

  /// Removes the dead instructions from the cloned loops.
  void removeUnusedInsts() {
    for (auto &Partition : PartitionContainer)
      Partition.removeUnusedInsts();
  }

  /// For each memory pointer, it computes the partitionId the pointer is
  /// used in.
  ///
  /// This returns an array of int where the I-th entry corresponds to the
  /// I-th entry in LAI.getRuntimePointerCheck(). If the pointer is used in
  /// multiple partitions its entry is set to -1.
  SmallVector<int, 8>
  computePartitionSetForPointers(const LoopAccessInfo &LAI) {
    const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();

    unsigned N = RtPtrCheck->Pointers.size();
    SmallVector<int, 8> PtrToPartitions(N);
    for (unsigned I = 0; I < N; ++I) {
      Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
      auto Instructions =
          LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);

      int &Partition = PtrToPartitions[I];
      // First set it to uninitialized.
      Partition = -2;
      for (Instruction *Inst : Instructions) {
        // Note that this could be -1 if Inst is duplicated across multiple
        // partitions.
        int ThisPartition = this->InstToPartitionId[Inst];
        if (Partition == -2)
          Partition = ThisPartition;
        // -1 means belonging to multiple partitions.
        else if (Partition == -1)
          break;
        else if (Partition != (int)ThisPartition)
          Partition = -1;
      }
      assert(Partition != -2 && "Pointer not belonging to any partition");
    }

    return PtrToPartitions;
  }

  void print(raw_ostream &OS) const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      OS << "Partition " << Index++ << " (" << &P << "):\n";
      P.print();
    }
  }

  void dump() const { print(dbgs()); }

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const InstPartitionContainer &Partitions) {
    Partitions.print(OS);
    return OS;
  }
#endif

  void printBlocks() const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
      P.printBlocks();
    }
  }

private:
  using PartitionContainerT = std::list<InstPartition>;

  /// List of partitions.
  PartitionContainerT PartitionContainer;

  /// Mapping from Instruction to partition Id. If the instruction
  /// belongs to multiple partitions the entry contains -1.
  InstToPartitionIdT InstToPartitionId;

  Loop *L;
  LoopInfo *LI;
  DominatorTree *DT;

  /// The control structure to merge adjacent partitions if both satisfy
  /// the \p Predicate.
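  ///
  /// E.g. for partitions [A, B, C] where only A and B satisfy the predicate,
  /// B is merged into A and C is left alone; a non-matching partition in
  /// between would reset the chain (illustrative).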
  template <class UnaryPredicate>
  void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
    InstPartition *PrevMatch = nullptr;
    for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
      auto DoesMatch = Predicate(&*I);
      if (PrevMatch == nullptr && DoesMatch) {
        PrevMatch = &*I;
        ++I;
      } else if (PrevMatch != nullptr && DoesMatch) {
        I->moveTo(*PrevMatch);
        I = PartitionContainer.erase(I);
      } else {
        PrevMatch = nullptr;
        ++I;
      }
    }
  }

  /// Assign new LoopIDs for the partition's cloned loop.
  void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) {
    std::optional<MDNode *> PartitionID = makeFollowupLoopID(
        OrigLoopID,
        {LLVMLoopDistributeFollowupAll,
         Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential
                             : LLVMLoopDistributeFollowupCoincident});
    if (PartitionID) {
      Loop *NewLoop = Part->getDistributedLoop();
      NewLoop->setLoopID(PartitionID.value());
    }
  }
};

/// For each memory instruction, this class maintains the difference of the
/// number of unsafe dependences that start out from this instruction minus
/// those that end here.
///
/// By traversing the memory instructions in program order and accumulating
/// this number, we know whether any unsafe dependence crosses over a program
/// point.
class MemoryInstructionDependences {
  using Dependence = MemoryDepChecker::Dependence;

public:
  struct Entry {
    Instruction *Inst;
    unsigned NumUnsafeDependencesStartOrEnd = 0;

    Entry(Instruction *Inst) : Inst(Inst) {}
  };

  using AccessesType = SmallVector<Entry, 8>;

  AccessesType::const_iterator begin() const { return Accesses.begin(); }
  AccessesType::const_iterator end() const { return Accesses.end(); }

  MemoryInstructionDependences(
      const SmallVectorImpl<Instruction *> &Instructions,
      const SmallVectorImpl<Dependence> &Dependences) {
    Accesses.append(Instructions.begin(), Instructions.end());

    LLVM_DEBUG(dbgs() << "Backward dependences:\n");
    for (const auto &Dep : Dependences)
      if (Dep.isPossiblyBackward()) {
        // Note that the designations source and destination follow the program
        // order, i.e. source is always first. (The direction is given by the
        // DepType.)
        ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
        --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;

        LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions));
      }
  }

private:
  AccessesType Accesses;
};

/// The actual class performing the per-loop work.
class LoopDistributeForLoop {
public:
  LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
                        ScalarEvolution *SE, LoopAccessInfoManager &LAIs,
                        OptimizationRemarkEmitter *ORE)
      : L(L), F(F), LI(LI), DT(DT), SE(SE), LAIs(LAIs), ORE(ORE) {
    setForced();
  }

  /// Try to distribute an inner-most loop.
  bool processLoop() {
    assert(L->isInnermost() && "Only process inner loops.");

    LLVM_DEBUG(dbgs() << "\nLDist: In \""
                      << L->getHeader()->getParent()->getName()
                      << "\" checking " << *L << "\n");

    // Having a single exit block implies there's also one exiting block.
    if (!L->getExitBlock())
      return fail("MultipleExitBlocks", "multiple exit blocks");
    if (!L->isLoopSimplifyForm())
      return fail("NotLoopSimplifyForm",
                  "loop is not in loop-simplify form");
    if (!L->isRotatedForm())
      return fail("NotBottomTested", "loop is not bottom tested");

    BasicBlock *PH = L->getLoopPreheader();

    LAI = &LAIs.getInfo(*L);

    // Currently, we only distribute to isolate the part of the loop with
    // dependence cycles to enable partial vectorization.
    if (LAI->canVectorizeMemory())
      return fail("MemOpsCanBeVectorized",
                  "memory operations are safe for vectorization");

    auto *Dependences = LAI->getDepChecker().getDependences();
    if (!Dependences || Dependences->empty())
      return fail("NoUnsafeDeps", "no unsafe dependences to isolate");

    InstPartitionContainer Partitions(L, LI, DT);

    // First, go through each memory operation and assign them to consecutive
    // partitions (the order of partitions follows program order). Put those
    // with unsafe dependences into a "cyclic" partition, otherwise put each
    // store in its own "non-cyclic" partition (we'll merge these later).
    //
    // Note that a memory operation (e.g. Load2 below) at a program point that
    // has an unsafe dependence (Store3->Load1) spanning over it must be
    // included in the same cyclic partition as the dependent operations. This
    // is to preserve the original program order after distribution. E.g.:
    //
    //           NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
    //  Load1  -.             1                           0->1
    //  Load2   | /Unsafe/    0                           1
    //  Store3 -'            -1                           1->0
    //  Load4                 0                           0
    //
    // NumUnsafeDependencesActive > 0 indicates this situation and in this case
    // we just keep assigning to the same cyclic partition until
    // NumUnsafeDependencesActive reaches 0.
    const MemoryDepChecker &DepChecker = LAI->getDepChecker();
    MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
                                     *Dependences);

    int NumUnsafeDependencesActive = 0;
    for (const auto &InstDep : MID) {
      Instruction *I = InstDep.Inst;
      // We update NumUnsafeDependencesActive post-instruction, catch the
      // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
      if (NumUnsafeDependencesActive ||
          InstDep.NumUnsafeDependencesStartOrEnd > 0)
        Partitions.addToCyclicPartition(I);
      else
        Partitions.addToNewNonCyclicPartition(I);
      NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
      assert(NumUnsafeDependencesActive >= 0 &&
             "Negative number of dependences active");
    }

    // Add partitions for values used outside. These partitions can be out of
    // order from the original program order. This is OK because if the
    // partition uses a load we will merge this partition with the original
    // partition of the load that we set up in the previous loop (see
    // mergeToAvoidDuplicatedLoads).
    auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
    for (auto *Inst : DefsUsedOutside)
      Partitions.addToNewNonCyclicPartition(Inst);

    LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("CantIsolateUnsafeDeps",
                  "cannot isolate unsafe dependencies");

    // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
    // should be able to vectorize these together.
    Partitions.mergeBeforePopulating();
    LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("CantIsolateUnsafeDeps",
                  "cannot isolate unsafe dependencies");

    // Now, populate the partitions with non-memory operations.
    Partitions.populateUsedSet();
    LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);

    // In order to preserve original lexical order for loads, keep them in the
    // partition that we set up in the MemoryInstructionDependences loop.
    if (Partitions.mergeToAvoidDuplicatedLoads()) {
      LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
                        << Partitions);
      if (Partitions.getSize() < 2)
        return fail("CantIsolateUnsafeDeps",
                    "cannot isolate unsafe dependencies");
    }

    // Don't distribute the loop if we need too many SCEV run-time checks, or
    // any at all if they would be illegal.
    const SCEVPredicate &Pred = LAI->getPSE().getPredicate();
    if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) {
      return fail("RuntimeCheckWithConvergent",
                  "may not insert runtime check with convergent operation");
    }

    if (Pred.getComplexity() > (IsForced.value_or(false)
                                    ? PragmaDistributeSCEVCheckThreshold
                                    : DistributeSCEVCheckThreshold))
      return fail("TooManySCEVRuntimeChecks",
                  "too many SCEV run-time checks needed.\n");

    if (!IsForced.value_or(false) && hasDisableAllTransformsHint(L))
      return fail("HeuristicDisabled", "distribution heuristic disabled");

    LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
    // We're done forming the partitions; set up the reverse mapping from
    // instructions to partitions.
    Partitions.setupPartitionIdOnInstructions();

    // If we need run-time checks, version the loop now.
    auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
    const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
    const auto &AllChecks = RtPtrChecking->getChecks();
    auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
                                                  RtPtrChecking);

    if (LAI->hasConvergentOp() && !Checks.empty()) {
      return fail("RuntimeCheckWithConvergent",
                  "may not insert runtime check with convergent operation");
    }

    // To keep things simple have an empty preheader before we version or clone
    // the loop. (Also split if this has no predecessor, i.e. entry, because we
    // rely on PH having a predecessor.)
    if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
      SplitBlock(PH, PH->getTerminator(), DT, LI);
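
    // If versioning is needed, the CFG conceptually becomes (illustrative):
    //
    //   if (any cross-partition runtime check fails)
    //     run the unmodified fallback loop;
    //   else
    //     run the loop that is distributed below;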
    if (!Pred.isAlwaysTrue() || !Checks.empty()) {
      assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning");

      MDNode *OrigLoopID = L->getLoopID();

      LLVM_DEBUG(dbgs() << "\nPointers:\n");
      LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
      LoopVersioning LVer(*LAI, Checks, L, LI, DT, SE);
      LVer.versionLoop(DefsUsedOutside);
      LVer.annotateLoopWithNoAlias();

      // The unversioned loop will not be changed, so we inherit all attributes
      // from the original loop, but remove the loop distribution metadata to
      // avoid distributing it again.
      MDNode *UnversionedLoopID =
          makeFollowupLoopID(OrigLoopID,
                             {LLVMLoopDistributeFollowupAll,
                              LLVMLoopDistributeFollowupFallback},
                             "llvm.loop.distribute.", true)
              .value();
      LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID);
    }

    // Create identical copies of the original loop for each partition and hook
    // them up sequentially.
    Partitions.cloneLoops();

    // Now, we remove the instructions from each loop that don't belong to that
    // partition.
    Partitions.removeUnusedInsts();
    LLVM_DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
    LLVM_DEBUG(Partitions.printBlocks());

    if (LDistVerify) {
      LI->verify(*DT);
      assert(DT->verify(DominatorTree::VerificationLevel::Fast));
    }

    ++NumLoopsDistributed;
    // Report the success.
    ORE->emit([&]() {
      return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
                                L->getHeader())
             << "distributed loop";
    });
    return true;
  }

  /// Provide diagnostics, then \return false.
  bool fail(StringRef RemarkName, StringRef Message) {
    LLVMContext &Ctx = F->getContext();
    bool Forced = isForced().value_or(false);

    LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n");

    // With Rpass-missed report that distribution failed.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed",
                                      L->getStartLoc(), L->getHeader())
             << "loop not distributed: use -Rpass-analysis=loop-distribute "
                "for more info";
    });

    // With Rpass-analysis report why. This is on by default if distribution
    // was requested explicitly.
    ORE->emit(OptimizationRemarkAnalysis(
                  Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME,
                  RemarkName, L->getStartLoc(), L->getHeader())
              << "loop not distributed: " << Message);

    // Also issue a warning if distribution was requested explicitly but it
    // failed.
    if (Forced)
      Ctx.diagnose(DiagnosticInfoOptimizationFailure(
          *F, L->getStartLoc(), "loop not distributed: failed "
                                "explicitly specified loop distribution"));

    return false;
  }

  /// Returns whether distribution was forced to be enabled/disabled for the
  /// loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false). If the optional has no value
  /// distribution was not forced either way.
  const std::optional<bool> &isForced() const { return IsForced; }

private:
  /// Filter out checks between pointers from the same partition.
  ///
  /// \p PtrToPartition contains the partition number for pointers. Partition
  /// number -1 means that the pointer is used in multiple partitions. In this
  /// case we can't safely omit the check.
  SmallVector<RuntimePointerCheck, 4> includeOnlyCrossPartitionChecks(
      const SmallVectorImpl<RuntimePointerCheck> &AllChecks,
      const SmallVectorImpl<int> &PtrToPartition,
      const RuntimePointerChecking *RtPtrChecking) {
    SmallVector<RuntimePointerCheck, 4> Checks;

    copy_if(AllChecks, std::back_inserter(Checks),
            [&](const RuntimePointerCheck &Check) {
              for (unsigned PtrIdx1 : Check.first->Members)
                for (unsigned PtrIdx2 : Check.second->Members)
                  // Only include this check if there is a pair of pointers
                  // that require checking and the pointers fall into
                  // separate partitions.
                  //
                  // (Note that we already know at this point that the two
                  // pointer groups need checking but it doesn't follow
                  // that each pair of pointers within the two groups need
                  // checking as well.
                  //
                  // In other words we don't want to include a check just
                  // because there is a pair of pointers between the two
                  // pointer groups that require checks and a different
                  // pair whose pointers fall into different partitions.)
                  if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
                      !RuntimePointerChecking::arePointersInSamePartition(
                          PtrToPartition, PtrIdx1, PtrIdx2))
                    return true;
              return false;
            });

    return Checks;
  }

  /// Check whether the loop metadata is forcing distribution to be
  /// enabled/disabled.
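  ///
  /// For example (illustrative IR), "#pragma clang loop distribute(enable)"
  /// is lowered to loop metadata of the form:
  ///
  ///   !0 = distinct !{!0, !1}
  ///   !1 = !{!"llvm.loop.distribute.enable", i1 true}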
  void setForced() {
    std::optional<const MDOperand *> Value =
        findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
    if (!Value)
      return;

    const MDOperand *Op = *Value;
    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
    IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
  }

  Loop *L;
  Function *F;

  // Analyses used.
  LoopInfo *LI;
  const LoopAccessInfo *LAI = nullptr;
  DominatorTree *DT;
  ScalarEvolution *SE;
  LoopAccessInfoManager &LAIs;
  OptimizationRemarkEmitter *ORE;

  /// Indicates whether distribution is forced to be enabled/disabled for
  /// the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false). If the optional has no value
  /// distribution was not forced either way.
  std::optional<bool> IsForced;
};

} // end anonymous namespace

/// Shared implementation between new and old PMs.
static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
                    ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
                    LoopAccessInfoManager &LAIs) {
  // Build up a worklist of inner-most loops to distribute. This is necessary
  // as the act of distributing a loop creates new loops and can invalidate
  // iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *TopLevelLoop : *LI)
    for (Loop *L : depth_first(TopLevelLoop))
      // We only handle inner-most loops.
      if (L->isInnermost())
        Worklist.push_back(L);

  // Now walk the identified inner loops.
  bool Changed = false;
  for (Loop *L : Worklist) {
    LoopDistributeForLoop LDL(L, &F, LI, DT, SE, LAIs, ORE);

    // If distribution was forced for the specific loop to be
    // enabled/disabled, follow that. Otherwise use the global flag.
    if (LDL.isForced().value_or(EnableLoopDistribute))
      Changed |= LDL.processLoop();
  }

  return Changed;
}
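
// Note that distribution is off by default: runImpl only visits a loop when
// -enable-loop-distribute is given or the loop itself carries
// "llvm.loop.distribute.enable" metadata (see setForced above).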

namespace {

/// The pass class.
class LoopDistributeLegacy : public FunctionPass {
public:
  static char ID;

  LoopDistributeLegacy() : FunctionPass(ID) {
    // The default is set by the caller.
    initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs();

    return runImpl(F, LI, DT, SE, ORE, LAIs);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
  }
};

} // end anonymous namespace

PreservedAnalyses LoopDistributePass::run(Function &F,
                                          FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
  bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, LAIs);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserve<LoopAnalysis>();
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

char LoopDistributeLegacy::ID;

static const char ldist_name[] = "Loop Distribution";

INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false)

FunctionPass *llvm::createLoopDistributePass() {
  return new LoopDistributeLegacy();
}