//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Loop Distribution Pass. Its main focus is to
// distribute loops that cannot be vectorized due to dependence cycles. It
// tries to isolate the offending dependences into a new loop allowing
// vectorization of the remaining parts.
//
// For dependence analysis, the pass uses the LoopVectorizer's
// LoopAccessAnalysis. Because this analysis presumes no change in the order of
// memory operations, special care is taken to preserve the lexical order of
// these operations.
//
// Similarly to the Vectorizer, the pass also supports loop versioning to
// run-time disambiguate potentially overlapping arrays.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopDistribute.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <cassert>
#include <list>
#include <tuple>
#include <utility>

using namespace llvm;

#define LDIST_NAME "loop-distribute"
#define DEBUG_TYPE LDIST_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopDistributeFollowupAll =
    "llvm.loop.distribute.followup_all";
static const char *const LLVMLoopDistributeFollowupCoincident =
    "llvm.loop.distribute.followup_coincident";
static const char *const LLVMLoopDistributeFollowupSequential =
    "llvm.loop.distribute.followup_sequential";
static const char *const LLVMLoopDistributeFollowupFallback =
    "llvm.loop.distribute.followup_fallback";
/// @}
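
// Illustrative example (not from this file) of how these followup attributes
// appear in IR: a loop requesting distribution can also carry attributes to
// be transferred onto the resulting loops by makeFollowupLoopID(), e.g.:
//
//   br i1 %cond, label %header, label %exit, !llvm.loop !0
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.distribute.enable", i1 true}
//   !2 = !{!"llvm.loop.distribute.followup_coincident", !3}
//   !3 = !{!"llvm.loop.vectorize.enable", i1 true}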

static cl::opt<bool>
    LDistVerify("loop-distribute-verify", cl::Hidden,
                cl::desc("Turn on DominatorTree and LoopInfo verification "
                         "after Loop Distribution"),
                cl::init(false));

static cl::opt<bool> DistributeNonIfConvertible(
    "loop-distribute-non-if-convertible", cl::Hidden,
    cl::desc("Whether to distribute into a loop that may not be "
             "if-convertible by the loop vectorizer"),
    cl::init(false));

static cl::opt<unsigned> DistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution"));

static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
    cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution for loops marked with #pragma clang loop "
             "distribute(enable)"));

static cl::opt<bool> EnableLoopDistribute(
    "enable-loop-distribute", cl::Hidden,
    cl::desc("Enable the new, experimental LoopDistribution Pass"),
    cl::init(false));

STATISTIC(NumLoopsDistributed, "Number of loops distributed");

namespace {

/// Maintains the set of instructions of the loop for a partition before
/// cloning. After cloning, it hosts the new loop.
class InstPartition {
  using InstructionSet = SmallSetVector<Instruction *, 8>;

public:
  InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
      : DepCycle(DepCycle), OrigLoop(L) {
    Set.insert(I);
  }

  /// Returns whether this partition contains a dependence cycle.
  bool hasDepCycle() const { return DepCycle; }

  /// Adds an instruction to this partition.
  void add(Instruction *I) { Set.insert(I); }

  /// Collection accessors.
  InstructionSet::iterator begin() { return Set.begin(); }
  InstructionSet::iterator end() { return Set.end(); }
  InstructionSet::const_iterator begin() const { return Set.begin(); }
  InstructionSet::const_iterator end() const { return Set.end(); }
  bool empty() const { return Set.empty(); }

  /// Moves this partition into \p Other. This partition becomes empty
  /// after this.
  void moveTo(InstPartition &Other) {
    Other.Set.insert(Set.begin(), Set.end());
    Set.clear();
    Other.DepCycle |= DepCycle;
  }

  /// Populates the partition with the transitive closure of all the
  /// instructions that the seeded instructions depend on.
  void populateUsedSet() {
    // FIXME: We currently don't use control-dependence but simply include all
    // blocks (possibly empty at the end) and let simplifycfg mostly clean this
    // up.
    for (auto *B : OrigLoop->getBlocks())
      Set.insert(B->getTerminator());

    // Follow the use-def chains to form a transitive closure of all the
    // instructions that the originally seeded instructions depend on.
    SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
    while (!Worklist.empty()) {
      Instruction *I = Worklist.pop_back_val();
      // Insert instructions from the loop that we depend on.
      for (Value *V : I->operand_values()) {
        auto *I = dyn_cast<Instruction>(V);
        if (I && OrigLoop->contains(I->getParent()) && Set.insert(I))
          Worklist.push_back(I);
      }
    }
  }

  /// Clones the original loop.
  ///
  /// Updates LoopInfo and DominatorTree using the information that block \p
  /// LoopDomBB dominates the loop.
  Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
                               unsigned Index, LoopInfo *LI,
                               DominatorTree *DT) {
    ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
                                          VMap, Twine(".ldist") + Twine(Index),
                                          LI, DT, ClonedLoopBlocks);
    return ClonedLoop;
  }

  /// The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  const Loop *getClonedLoop() const { return ClonedLoop; }

  /// Returns the loop where this partition ends up after distribution.
  /// If this partition is mapped to the original loop then this is the
  /// original loop itself.
  Loop *getDistributedLoop() const {
    return ClonedLoop ? ClonedLoop : OrigLoop;
  }

  /// The VMap that is populated by cloning and then used in
  /// remapInstructions to remap the cloned instructions.
  ValueToValueMapTy &getVMap() { return VMap; }

  /// Remaps the cloned instructions using VMap.
  void remapInstructions() {
    remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
  }

  /// Based on the set of instructions selected for this partition,
  /// removes the unnecessary ones.
  void removeUnusedInsts() {
    SmallVector<Instruction *, 8> Unused;

    for (auto *Block : OrigLoop->getBlocks())
      for (auto &Inst : *Block)
        if (!Set.count(&Inst)) {
          Instruction *NewInst = &Inst;
          if (!VMap.empty())
            NewInst = cast<Instruction>(VMap[NewInst]);

          assert(!isa<BranchInst>(NewInst) &&
                 "Branches are marked used early on");
          Unused.push_back(NewInst);
        }

    // Delete the instructions backwards, as it has a reduced likelihood of
    // having to update as many def-use and use-def chains.
    for (auto *Inst : reverse(Unused)) {
      if (!Inst->use_empty())
        Inst->replaceAllUsesWith(PoisonValue::get(Inst->getType()));
      Inst->eraseFromParent();
    }
  }

  void print(raw_ostream &OS) const {
    OS << (DepCycle ? " (cycle)\n" : "\n");
    for (auto *I : Set)
      // Prefix with the block name.
      OS << " " << I->getParent()->getName() << ":" << *I << "\n";
  }

  void printBlocks(raw_ostream &OS) const {
    for (auto *BB : getDistributedLoop()->getBlocks())
      OS << *BB;
  }

private:
  /// Instructions from OrigLoop selected for this partition.
  InstructionSet Set;

  /// Whether this partition contains a dependence cycle.
  bool DepCycle;

  /// The original loop.
  Loop *OrigLoop;

  /// The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  Loop *ClonedLoop = nullptr;

  /// The blocks of ClonedLoop including the preheader. If this
  /// partition is mapped to the original loop, this is empty.
  SmallVector<BasicBlock *, 8> ClonedLoopBlocks;

  /// This gets populated by cloning once the set of instructions has been
  /// finalized. If this partition is mapped to the original loop, it is not
  /// set.
  ValueToValueMapTy VMap;
};

/// Holds the set of Partitions. It populates them, merges them and then
/// clones the loops.
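/// In the order driven by processLoop(): partitions are seeded from memory
/// instructions (addToCyclicPartition / addToNewNonCyclicPartition), merged
/// by heuristics (mergeBeforePopulating), closed over their in-loop
/// dependences (populateUsedSet), merged again so that no load is duplicated
/// (mergeToAvoidDuplicatedLoads), and finally materialized as separate loops
/// (cloneLoops, removeUnusedInsts).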
class InstPartitionContainer {
  using InstToPartitionIdT = DenseMap<Instruction *, int>;

public:
  InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
      : L(L), LI(LI), DT(DT) {}

  /// Returns the number of partitions.
  unsigned getSize() const { return PartitionContainer.size(); }

  /// Adds \p Inst into the current partition if that is marked to
  /// contain cycles. Otherwise, starts a new partition for it.
  void addToCyclicPartition(Instruction *Inst) {
    // If the current partition is non-cyclic, start a new one.
    if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
      PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
    else
      PartitionContainer.back().add(Inst);
  }

  /// Adds \p Inst into a partition that is not marked to contain
  /// dependence cycles.
  ///
  // Initially we isolate memory instructions into as many partitions as
  // possible, then later we may merge them back together.
  void addToNewNonCyclicPartition(Instruction *Inst) {
    PartitionContainer.emplace_back(Inst, L);
  }

  /// Merges adjacent non-cyclic partitions.
  ///
  /// The idea is that we currently only want to isolate the non-vectorizable
  /// partition. We could later allow more distribution among these partitions
  /// too.
  void mergeAdjacentNonCyclic() {
    mergeAdjacentPartitionsIf(
        [](const InstPartition *P) { return !P->hasDepCycle(); });
  }

  /// If a partition contains only conditional stores, we won't vectorize
  /// it. Try to merge it with a previous cyclic partition.
  void mergeNonIfConvertible() {
    mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
      if (Partition->hasDepCycle())
        return true;

      // Now, check if all stores are conditional in this partition.
      bool seenStore = false;

      for (auto *Inst : *Partition)
        if (isa<StoreInst>(Inst)) {
          seenStore = true;
          if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
            return false;
        }
      return seenStore;
    });
  }

  /// Merges the partitions according to various heuristics.
  void mergeBeforePopulating() {
    mergeAdjacentNonCyclic();
    if (!DistributeNonIfConvertible)
      mergeNonIfConvertible();
  }

  /// Merges partitions in order to ensure that no loads are duplicated.
  ///
  /// We can't duplicate loads because that could potentially reorder them.
  /// LoopAccessAnalysis provides dependency information under the assumption
  /// that the order of memory operations is preserved.
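  ///
  /// For example (names purely illustrative): if %v = load %p is needed by
  /// two partitions, cloning the load into both loops could let a store to
  /// %p in the first loop execute before the duplicated load in the second,
  /// changing the value the second loop observes.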
  ///
  /// Returns true if any partitions were merged.
  bool mergeToAvoidDuplicatedLoads() {
    using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>;
    using ToBeMergedT = EquivalenceClasses<InstPartition *>;

    LoadToPartitionT LoadToPartition;
    ToBeMergedT ToBeMerged;

    // Step through the partitions and create equivalence classes between
    // partitions that contain the same load. Also put partitions in between
    // them in the same equivalence class to avoid reordering of memory
    // operations.
    for (PartitionContainerT::iterator I = PartitionContainer.begin(),
                                       E = PartitionContainer.end();
         I != E; ++I) {
      auto *PartI = &*I;

      // If a load occurs in two partitions PartI and PartJ, merge all
      // partitions (PartI, PartJ] into PartI.
      for (Instruction *Inst : *PartI)
        if (isa<LoadInst>(Inst)) {
          bool NewElt;
          LoadToPartitionT::iterator LoadToPart;

          std::tie(LoadToPart, NewElt) =
              LoadToPartition.insert(std::make_pair(Inst, PartI));
          if (!NewElt) {
            LLVM_DEBUG(
                dbgs()
                << "LDist: Merging partitions due to this load in multiple "
                << "partitions: " << PartI << ", " << LoadToPart->second << "\n"
                << *Inst << "\n");

            auto PartJ = I;
            do {
              --PartJ;
              ToBeMerged.unionSets(PartI, &*PartJ);
            } while (&*PartJ != LoadToPart->second);
          }
        }
    }
    if (ToBeMerged.empty())
      return false;

    // Merge the members of an equivalence class into its class leader. This
    // makes the merged-in partitions empty.
    for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
         I != E; ++I) {
      if (!I->isLeader())
        continue;

      auto PartI = I->getData();
      for (auto *PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
                                    ToBeMerged.member_end())) {
        PartJ->moveTo(*PartI);
      }
    }

    // Remove the empty partitions.
    PartitionContainer.remove_if(
        [](const InstPartition &P) { return P.empty(); });

    return true;
  }

  /// Sets up the mapping from instructions to partitions. If the
  /// instruction is duplicated across multiple partitions, set the entry
  /// to -1.
  void setupPartitionIdOnInstructions() {
    int PartitionID = 0;
    for (const auto &Partition : PartitionContainer) {
      for (Instruction *Inst : Partition) {
        bool NewElt;
        InstToPartitionIdT::iterator Iter;

        std::tie(Iter, NewElt) =
            InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
        if (!NewElt)
          Iter->second = -1;
      }
      ++PartitionID;
    }
  }

  /// Populates the partitions with everything that the seeding
  /// instructions require.
  void populateUsedSet() {
    for (auto &P : PartitionContainer)
      P.populateUsedSet();
  }

  /// This performs the main chunk of the work of cloning the loops for
  /// the partitions.
  void cloneLoops() {
    BasicBlock *OrigPH = L->getLoopPreheader();
    // At this point the predecessor of the preheader is either the memcheck
    // block or the top part of the original preheader.
    BasicBlock *Pred = OrigPH->getSinglePredecessor();
    assert(Pred && "Preheader does not have a single predecessor");
    BasicBlock *ExitBlock = L->getExitBlock();
    assert(ExitBlock && "No single exit block");
    Loop *NewLoop;

    assert(!PartitionContainer.empty() && "at least two partitions expected");
    // We're cloning the preheader along with the loop so we already made sure
    // it was empty.
    assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
           "preheader not empty");

    // Preserve the original loop ID for use after the transformation.
    MDNode *OrigLoopID = L->getLoopID();

    // Create a loop for each partition except the last. Clone the original
    // loop before PH along with adding a preheader for the cloned loop. Then
    // update PH to point to the newly added preheader.
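    //
    // Resulting block layout, illustrated for three partitions P0, P1, P2
    // where the last partition reuses the original loop:
    //
    //   Pred -> P0.preheader -> P0.loop -> P1.preheader -> P1.loop
    //        -> OrigPH -> OrigLoop -> ExitBlock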
    BasicBlock *TopPH = OrigPH;
    unsigned Index = getSize() - 1;
    for (auto &Part : llvm::drop_begin(llvm::reverse(PartitionContainer))) {
      NewLoop = Part.cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);

      Part.getVMap()[ExitBlock] = TopPH;
      Part.remapInstructions();
      setNewLoopID(OrigLoopID, &Part);
      --Index;
      TopPH = NewLoop->getLoopPreheader();
    }
    Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);

    // Also set a new loop ID for the last loop.
    setNewLoopID(OrigLoopID, &PartitionContainer.back());

    // Now go in forward order and update the immediate dominator for the
    // preheaders with the exiting block of the previous loop. Dominance
    // within the loop is updated in cloneLoopWithPreheader.
    for (auto Curr = PartitionContainer.cbegin(),
              Next = std::next(PartitionContainer.cbegin()),
              E = PartitionContainer.cend();
         Next != E; ++Curr, ++Next)
      DT->changeImmediateDominator(
          Next->getDistributedLoop()->getLoopPreheader(),
          Curr->getDistributedLoop()->getExitingBlock());
  }

  /// Removes the dead instructions from the cloned loops.
  void removeUnusedInsts() {
    for (auto &Partition : PartitionContainer)
      Partition.removeUnusedInsts();
  }

  /// For each memory pointer, it computes the partition the pointer is
  /// used in.
  ///
  /// This returns an array of int where the I-th entry corresponds to the
  /// I-th entry in LAI.getRuntimePointerChecking(). If the pointer is used
  /// in multiple partitions its entry is set to -1.
  SmallVector<int, 8>
  computePartitionSetForPointers(const LoopAccessInfo &LAI) {
    const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();

    unsigned N = RtPtrCheck->Pointers.size();
    SmallVector<int, 8> PtrToPartitions(N);
    for (unsigned I = 0; I < N; ++I) {
      Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
      auto Instructions =
          LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);

      int &Partition = PtrToPartitions[I];
      // First set it to uninitialized.
      Partition = -2;
      for (Instruction *Inst : Instructions) {
        // Note that this could be -1 if Inst is duplicated across multiple
        // partitions.
        int ThisPartition = this->InstToPartitionId[Inst];
        if (Partition == -2)
          Partition = ThisPartition;
        // -1 means belonging to multiple partitions.
        else if (Partition == -1)
          break;
        else if (Partition != (int)ThisPartition)
          Partition = -1;
      }
      assert(Partition != -2 && "Pointer not belonging to any partition");
    }

    return PtrToPartitions;
  }

  void print(raw_ostream &OS) const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      OS << "LDist: Partition " << Index++ << ":";
      P.print(OS);
    }
  }

  void dump() const { print(dbgs()); }

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const InstPartitionContainer &Partitions) {
    Partitions.print(OS);
    return OS;
  }
#endif

  void printBlocks(raw_ostream &OS) const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      OS << "LDist: Partition " << Index++ << ":";
      P.printBlocks(OS);
    }
  }

private:
  using PartitionContainerT = std::list<InstPartition>;

  /// List of partitions.
  PartitionContainerT PartitionContainer;

  /// Mapping from Instruction to partition Id. If the instruction
  /// belongs to multiple partitions the entry contains -1.
  InstToPartitionIdT InstToPartitionId;

  Loop *L;
  LoopInfo *LI;
  DominatorTree *DT;

  /// The control structure to merge adjacent partitions if both satisfy
  /// the \p Predicate.
  template <class UnaryPredicate>
  void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
    InstPartition *PrevMatch = nullptr;
    for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
      auto DoesMatch = Predicate(&*I);
      if (PrevMatch == nullptr && DoesMatch) {
        PrevMatch = &*I;
        ++I;
      } else if (PrevMatch != nullptr && DoesMatch) {
        I->moveTo(*PrevMatch);
        I = PartitionContainer.erase(I);
      } else {
        PrevMatch = nullptr;
        ++I;
      }
    }
  }

  /// Assign new LoopIDs for the partition's cloned loop.
  void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) {
    std::optional<MDNode *> PartitionID = makeFollowupLoopID(
        OrigLoopID,
        {LLVMLoopDistributeFollowupAll,
         Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential
                             : LLVMLoopDistributeFollowupCoincident});
    if (PartitionID) {
      Loop *NewLoop = Part->getDistributedLoop();
      NewLoop->setLoopID(*PartitionID);
    }
  }
};

/// For each memory instruction, this class maintains the difference between
/// the number of unsafe dependences that start at this instruction and the
/// number that end here.
///
/// By traversing the memory instructions in program order and accumulating
/// this number, we know whether any unsafe dependence crosses over a program
/// point.
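///
/// For example (illustrative): for a single backward dependence from A to B,
/// with A preceding B in program order, the count is +1 at A and -1 at B.
/// The accumulated sum is positive from A up to but not including B, so A, B
/// and every instruction between them end up in the same cyclic partition (B
/// is covered too because the sum is consulted before it is updated).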
class MemoryInstructionDependences {
  using Dependence = MemoryDepChecker::Dependence;

public:
  struct Entry {
    Instruction *Inst;
    unsigned NumUnsafeDependencesStartOrEnd = 0;

    Entry(Instruction *Inst) : Inst(Inst) {}
  };

  using AccessesType = SmallVector<Entry, 8>;

  AccessesType::const_iterator begin() const { return Accesses.begin(); }
  AccessesType::const_iterator end() const { return Accesses.end(); }

  MemoryInstructionDependences(
      const SmallVectorImpl<Instruction *> &Instructions,
      const SmallVectorImpl<Dependence> &Dependences) {
    Accesses.append(Instructions.begin(), Instructions.end());

    LLVM_DEBUG(dbgs() << "LDist: Backward dependences:\n");
    for (const auto &Dep : Dependences)
      if (Dep.isPossiblyBackward()) {
        // Note that the designations source and destination follow the
        // program order, i.e. source is always first. (The direction is
        // given by the DepType.)
        ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
        --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;

        LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions));
      }
  }

private:
  AccessesType Accesses;
};

/// The actual class performing the per-loop work.
class LoopDistributeForLoop {
public:
  LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
                        ScalarEvolution *SE, LoopAccessInfoManager &LAIs,
                        OptimizationRemarkEmitter *ORE)
      : L(L), F(F), LI(LI), DT(DT), SE(SE), LAIs(LAIs), ORE(ORE) {
    setForced();
  }

  /// Try to distribute an inner-most loop.
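  ///
  /// In outline (a summary of the code below): check that the loop shape and
  /// its memory behavior are suitable, seed partitions from the memory
  /// instructions, run the merge heuristics, populate the partitions with
  /// the instructions they depend on, version the loop with run-time checks
  /// if required, then clone the loop once per partition and prune each
  /// clone down to its partition.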
  bool processLoop() {
    assert(L->isInnermost() && "Only process inner loops.");

    LLVM_DEBUG(dbgs() << "\nLDist: Checking a loop in '"
                      << L->getHeader()->getParent()->getName() << "' from "
                      << L->getLocStr() << "\n");

    // Having a single exit block implies there's also one exiting block.
    if (!L->getExitBlock())
      return fail("MultipleExitBlocks", "multiple exit blocks");
    if (!L->isLoopSimplifyForm())
      return fail("NotLoopSimplifyForm",
                  "loop is not in loop-simplify form");
    if (!L->isRotatedForm())
      return fail("NotBottomTested", "loop is not bottom tested");

    BasicBlock *PH = L->getLoopPreheader();

    LAI = &LAIs.getInfo(*L);

    // Currently, we only distribute to isolate the part of the loop with
    // dependence cycles to enable partial vectorization.
    if (LAI->canVectorizeMemory())
      return fail("MemOpsCanBeVectorized",
                  "memory operations are safe for vectorization");

    auto *Dependences = LAI->getDepChecker().getDependences();
    if (!Dependences || Dependences->empty())
      return fail("NoUnsafeDeps", "no unsafe dependences to isolate");

    LLVM_DEBUG(dbgs() << "LDist: Found a candidate loop: "
                      << L->getHeader()->getName() << "\n");

    InstPartitionContainer Partitions(L, LI, DT);

    // First, go through each memory operation and assign it to a consecutive
    // partition (the order of partitions follows program order). Put those
    // with unsafe dependences into a "cyclic" partition; otherwise put each
    // operation in its own "non-cyclic" partition (we'll merge these later).
    //
    // Note that a memory operation (e.g. Load2 below) at a program point that
    // has an unsafe dependence (Store3->Load1) spanning over it must be
    // included in the same cyclic partition as the dependent operations. This
    // is to preserve the original program order after distribution. E.g.:
    //
    //           NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
    //  Load1 -.              1                         0->1
    //  Load2  | /Unsafe/     0                         1
    //  Store3 -'            -1                         1->0
    //  Load4                 0                         0
    //
    // NumUnsafeDependencesActive > 0 indicates this situation and in this
    // case we just keep assigning to the same cyclic partition until
    // NumUnsafeDependencesActive reaches 0.
    const MemoryDepChecker &DepChecker = LAI->getDepChecker();
    MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
                                     *Dependences);

    int NumUnsafeDependencesActive = 0;
    for (const auto &InstDep : MID) {
      Instruction *I = InstDep.Inst;
      // We update NumUnsafeDependencesActive post-instruction, catch the
      // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
      if (NumUnsafeDependencesActive ||
          InstDep.NumUnsafeDependencesStartOrEnd > 0)
        Partitions.addToCyclicPartition(I);
      else
        Partitions.addToNewNonCyclicPartition(I);
      NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
      assert(NumUnsafeDependencesActive >= 0 &&
             "Negative number of dependences active");
    }

    // Add partitions for values used outside. These partitions can be out of
    // order from the original program order. This is OK because if the
    // partition uses a load we will merge this partition with the original
    // partition of the load that we set up in the previous loop (see
    // mergeToAvoidDuplicatedLoads).
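    // (A typical example of such a value is one computed inside the loop and
    // consumed after it, e.g. by a PHI in the exit block.)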
    auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
    for (auto *Inst : DefsUsedOutside)
      Partitions.addToNewNonCyclicPartition(Inst);

    LLVM_DEBUG(dbgs() << "LDist: Seeded partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("CantIsolateUnsafeDeps",
                  "cannot isolate unsafe dependencies");

    // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
    // should be able to vectorize these together.
    Partitions.mergeBeforePopulating();
    LLVM_DEBUG(dbgs() << "LDist: Merged partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("CantIsolateUnsafeDeps",
                  "cannot isolate unsafe dependencies");

    // Now, populate the partitions with non-memory operations.
    Partitions.populateUsedSet();
    LLVM_DEBUG(dbgs() << "LDist: Populated partitions:\n" << Partitions);

    // In order to preserve original lexical order for loads, keep them in the
    // partition that we set up in the MemoryInstructionDependences loop.
    if (Partitions.mergeToAvoidDuplicatedLoads()) {
      LLVM_DEBUG(dbgs() << "LDist: Partitions merged to ensure unique loads:\n"
                        << Partitions);
      if (Partitions.getSize() < 2)
        return fail("CantIsolateUnsafeDeps",
                    "cannot isolate unsafe dependencies");
    }

    // Don't distribute the loop if we need too many SCEV run-time checks, or
    // any at all if inserting them would be illegal.
    const SCEVPredicate &Pred = LAI->getPSE().getPredicate();
    if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) {
      return fail("RuntimeCheckWithConvergent",
                  "may not insert runtime check with convergent operation");
    }

    if (Pred.getComplexity() > (IsForced.value_or(false)
                                    ? PragmaDistributeSCEVCheckThreshold
                                    : DistributeSCEVCheckThreshold))
      return fail("TooManySCEVRuntimeChecks",
                  "too many SCEV run-time checks needed.\n");

    if (!IsForced.value_or(false) && hasDisableAllTransformsHint(L))
      return fail("HeuristicDisabled", "distribution heuristic disabled");

    LLVM_DEBUG(dbgs() << "LDist: Distributing loop: "
                      << L->getHeader()->getName() << "\n");
    // We're done forming the partitions; set up the reverse mapping from
    // instructions to partitions.
    Partitions.setupPartitionIdOnInstructions();

    // If we need run-time checks, version the loop now.
    auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
    const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
    const auto &AllChecks = RtPtrChecking->getChecks();
    auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
                                                  RtPtrChecking);

    if (LAI->hasConvergentOp() && !Checks.empty()) {
      return fail("RuntimeCheckWithConvergent",
                  "may not insert runtime check with convergent operation");
    }

    // To keep things simple have an empty preheader before we version or
    // clone the loop. (Also split if this has no predecessor, i.e. entry,
    // because we rely on PH having a predecessor.)
    if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
      SplitBlock(PH, PH->getTerminator(), DT, LI);

    if (!Pred.isAlwaysTrue() || !Checks.empty()) {
      assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning");

      MDNode *OrigLoopID = L->getLoopID();

      LLVM_DEBUG(dbgs() << "LDist: Pointers:\n");
      LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
      LoopVersioning LVer(*LAI, Checks, L, LI, DT, SE);
      LVer.versionLoop(DefsUsedOutside);
      LVer.annotateLoopWithNoAlias();

      // The unversioned loop will not be changed, so we inherit all attributes
      // from the original loop, but remove the loop distribution metadata to
      // avoid distributing it again.
      MDNode *UnversionedLoopID = *makeFollowupLoopID(
          OrigLoopID,
          {LLVMLoopDistributeFollowupAll, LLVMLoopDistributeFollowupFallback},
          "llvm.loop.distribute.", true);
      LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID);
    }

    // Create identical copies of the original loop for each partition and hook
    // them up sequentially.
    Partitions.cloneLoops();

    // Now, remove from each loop the instructions that don't belong to its
    // partition.
    Partitions.removeUnusedInsts();
    LLVM_DEBUG(dbgs() << "LDist: After removing unused Instrs:\n");
    LLVM_DEBUG(Partitions.printBlocks(dbgs()));

    if (LDistVerify) {
      LI->verify(*DT);
      assert(DT->verify(DominatorTree::VerificationLevel::Fast));
    }

    ++NumLoopsDistributed;
    // Report the success.
    ORE->emit([&]() {
      return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
                                L->getHeader())
             << "distributed loop";
    });
    return true;
  }

  /// Provide diagnostics, then \return false.
  bool fail(StringRef RemarkName, StringRef Message) {
    LLVMContext &Ctx = F->getContext();
    bool Forced = isForced().value_or(false);

    LLVM_DEBUG(dbgs() << "LDist: Skipping; " << Message << "\n");

    // With -Rpass-missed, report that distribution failed.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed",
                                      L->getStartLoc(), L->getHeader())
             << "loop not distributed: use -Rpass-analysis=loop-distribute for "
                "more info";
    });

    // With -Rpass-analysis, report why. This is on by default if distribution
    // was requested explicitly.
    ORE->emit(OptimizationRemarkAnalysis(
                  Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME,
                  RemarkName, L->getStartLoc(), L->getHeader())
              << "loop not distributed: " << Message);

    // Also issue a warning if distribution was requested explicitly but it
    // failed.
    if (Forced)
      Ctx.diagnose(DiagnosticInfoOptimizationFailure(
          *F, L->getStartLoc(), "loop not distributed: failed "
                                "explicitly specified loop distribution"));

    return false;
  }

  /// Returns whether distribution is forced to be enabled/disabled for the
  /// loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false). If the optional has no value
  /// distribution was not forced either way.
  const std::optional<bool> &isForced() const { return IsForced; }

private:
  /// Filter out checks between pointers from the same partition.
  ///
  /// \p PtrToPartition contains the partition number for pointers. Partition
  /// number -1 means that the pointer is used in multiple partitions. In this
  /// case we can't safely omit the check.
  SmallVector<RuntimePointerCheck, 4> includeOnlyCrossPartitionChecks(
      const SmallVectorImpl<RuntimePointerCheck> &AllChecks,
      const SmallVectorImpl<int> &PtrToPartition,
      const RuntimePointerChecking *RtPtrChecking) {
    SmallVector<RuntimePointerCheck, 4> Checks;

    copy_if(AllChecks, std::back_inserter(Checks),
            [&](const RuntimePointerCheck &Check) {
              for (unsigned PtrIdx1 : Check.first->Members)
                for (unsigned PtrIdx2 : Check.second->Members)
                  // Only include this check if there is a pair of pointers
                  // that require checking and the pointers fall into
                  // separate partitions.
                  //
                  // (Note that we already know at this point that the two
                  // pointer groups need checking but it doesn't follow
                  // that each pair of pointers within the two groups needs
                  // checking as well.
                  //
                  // In other words we don't want to include a check just
                  // because there is a pair of pointers between the two
                  // pointer groups that require checks and a different
                  // pair whose pointers fall into different partitions.)
                  if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
                      !RuntimePointerChecking::arePointersInSamePartition(
                          PtrToPartition, PtrIdx1, PtrIdx2))
                    return true;
              return false;
            });

    return Checks;
  }

  /// Check whether the loop metadata is forcing distribution to be
  /// enabled/disabled.
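  ///
  /// For example, #pragma clang loop distribute(enable) reaches this pass as
  /// loop metadata along these lines (illustrative):
  ///
  ///   !0 = distinct !{!0, !1}
  ///   !1 = !{!"llvm.loop.distribute.enable", i1 true}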
  void setForced() {
    std::optional<const MDOperand *> Value =
        findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
    if (!Value)
      return;

    const MDOperand *Op = *Value;
    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
    IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
  }

  Loop *L;
  Function *F;

  // Analyses used.
  LoopInfo *LI;
  const LoopAccessInfo *LAI = nullptr;
  DominatorTree *DT;
  ScalarEvolution *SE;
  LoopAccessInfoManager &LAIs;
  OptimizationRemarkEmitter *ORE;

  /// Indicates whether distribution is forced to be enabled/disabled for
  /// the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false). If the optional has no value
  /// distribution was not forced either way.
  std::optional<bool> IsForced;
};

} // end anonymous namespace

static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
                    ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
                    LoopAccessInfoManager &LAIs) {
  // Build up a worklist of inner-loops to distribute. This is necessary as the
  // act of distributing a loop creates new loops and can invalidate iterators
  // across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *TopLevelLoop : *LI)
    for (Loop *L : depth_first(TopLevelLoop))
      // We only handle inner-most loops.
      if (L->isInnermost())
        Worklist.push_back(L);

  // Now walk the identified inner loops.
  bool Changed = false;
  for (Loop *L : Worklist) {
    LoopDistributeForLoop LDL(L, &F, LI, DT, SE, LAIs, ORE);

    // If distribution was forced for the specific loop to be
    // enabled/disabled, follow that. Otherwise use the global flag.
    if (LDL.isForced().value_or(EnableLoopDistribute))
      Changed |= LDL.processLoop();
  }

  return Changed;
}
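
// Note: the pass is registered as "loop-distribute", so it can be exercised
// directly with e.g. `opt -passes=loop-distribute`. Unless a loop opts in via
// llvm.loop.distribute.enable metadata, distribution additionally requires
// the global -enable-loop-distribute flag (see runImpl above).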

PreservedAnalyses LoopDistributePass::run(Function &F,
                                          FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
  bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, LAIs);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserve<LoopAnalysis>();
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}