1 //===- BlockFrequencyImplInfo.cpp - Block Frequency Info Implementation ---===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Loops should be simplified before this analysis. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/Analysis/BlockFrequencyInfoImpl.h" 14 #include "llvm/ADT/APInt.h" 15 #include "llvm/ADT/DenseMap.h" 16 #include "llvm/ADT/GraphTraits.h" 17 #include "llvm/ADT/None.h" 18 #include "llvm/ADT/SCCIterator.h" 19 #include "llvm/Config/llvm-config.h" 20 #include "llvm/IR/Function.h" 21 #include "llvm/Support/BlockFrequency.h" 22 #include "llvm/Support/BranchProbability.h" 23 #include "llvm/Support/Compiler.h" 24 #include "llvm/Support/Debug.h" 25 #include "llvm/Support/ScaledNumber.h" 26 #include "llvm/Support/MathExtras.h" 27 #include "llvm/Support/raw_ostream.h" 28 #include <algorithm> 29 #include <cassert> 30 #include <cstddef> 31 #include <cstdint> 32 #include <iterator> 33 #include <list> 34 #include <numeric> 35 #include <utility> 36 #include <vector> 37 38 using namespace llvm; 39 using namespace llvm::bfi_detail; 40 41 #define DEBUG_TYPE "block-freq" 42 43 namespace llvm { 44 cl::opt<bool> CheckBFIUnknownBlockQueries( 45 "check-bfi-unknown-block-queries", 46 cl::init(false), cl::Hidden, 47 cl::desc("Check if block frequency is queried for an unknown block " 48 "for debugging missed BFI updates")); 49 } 50 51 ScaledNumber<uint64_t> BlockMass::toScaled() const { 52 if (isFull()) 53 return ScaledNumber<uint64_t>(1, 0); 54 return ScaledNumber<uint64_t>(getMass() + 1, -64); 55 } 56 57 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 58 LLVM_DUMP_METHOD void BlockMass::dump() const { print(dbgs()); } 59 #endif 60 61 static char getHexDigit(int N) { 62 assert(N < 16); 63 if (N < 10) 64 return '0' + N; 65 return 'a' + N - 10; 66 } 67 68 raw_ostream &BlockMass::print(raw_ostream &OS) const { 69 for (int Digits = 0; Digits < 16; ++Digits) 70 OS << getHexDigit(Mass >> (60 - Digits * 4) & 0xf); 71 return OS; 72 } 73 74 namespace { 75 76 using BlockNode = BlockFrequencyInfoImplBase::BlockNode; 77 using Distribution = BlockFrequencyInfoImplBase::Distribution; 78 using WeightList = BlockFrequencyInfoImplBase::Distribution::WeightList; 79 using Scaled64 = BlockFrequencyInfoImplBase::Scaled64; 80 using LoopData = BlockFrequencyInfoImplBase::LoopData; 81 using Weight = BlockFrequencyInfoImplBase::Weight; 82 using FrequencyData = BlockFrequencyInfoImplBase::FrequencyData; 83 84 /// Dithering mass distributer. 85 /// 86 /// This class splits up a single mass into portions by weight, dithering to 87 /// spread out error. No mass is lost. The dithering precision depends on the 88 /// precision of the product of \a BlockMass and \a BranchProbability. 89 /// 90 /// The distribution algorithm follows. 91 /// 92 /// 1. Initialize by saving the sum of the weights in \a RemWeight and the 93 /// mass to distribute in \a RemMass. 94 /// 95 /// 2. For each portion: 96 /// 97 /// 1. Construct a branch probability, P, as the portion's weight divided 98 /// by the current value of \a RemWeight. 99 /// 2. Calculate the portion's mass as \a RemMass times P. 100 /// 3. Update \a RemWeight and \a RemMass at each portion by subtracting 101 /// the current portion's weight and mass. 102 struct DitheringDistributer { 103 uint32_t RemWeight; 104 BlockMass RemMass; 105 106 DitheringDistributer(Distribution &Dist, const BlockMass &Mass); 107 108 BlockMass takeMass(uint32_t Weight); 109 }; 110 111 } // end anonymous namespace 112 113 DitheringDistributer::DitheringDistributer(Distribution &Dist, 114 const BlockMass &Mass) { 115 Dist.normalize(); 116 RemWeight = Dist.Total; 117 RemMass = Mass; 118 } 119 120 BlockMass DitheringDistributer::takeMass(uint32_t Weight) { 121 assert(Weight && "invalid weight"); 122 assert(Weight <= RemWeight); 123 BlockMass Mass = RemMass * BranchProbability(Weight, RemWeight); 124 125 // Decrement totals (dither). 126 RemWeight -= Weight; 127 RemMass -= Mass; 128 return Mass; 129 } 130 131 void Distribution::add(const BlockNode &Node, uint64_t Amount, 132 Weight::DistType Type) { 133 assert(Amount && "invalid weight of 0"); 134 uint64_t NewTotal = Total + Amount; 135 136 // Check for overflow. It should be impossible to overflow twice. 137 bool IsOverflow = NewTotal < Total; 138 assert(!(DidOverflow && IsOverflow) && "unexpected repeated overflow"); 139 DidOverflow |= IsOverflow; 140 141 // Update the total. 142 Total = NewTotal; 143 144 // Save the weight. 145 Weights.push_back(Weight(Type, Node, Amount)); 146 } 147 148 static void combineWeight(Weight &W, const Weight &OtherW) { 149 assert(OtherW.TargetNode.isValid()); 150 if (!W.Amount) { 151 W = OtherW; 152 return; 153 } 154 assert(W.Type == OtherW.Type); 155 assert(W.TargetNode == OtherW.TargetNode); 156 assert(OtherW.Amount && "Expected non-zero weight"); 157 if (W.Amount > W.Amount + OtherW.Amount) 158 // Saturate on overflow. 159 W.Amount = UINT64_MAX; 160 else 161 W.Amount += OtherW.Amount; 162 } 163 164 static void combineWeightsBySorting(WeightList &Weights) { 165 // Sort so edges to the same node are adjacent. 166 llvm::sort(Weights, [](const Weight &L, const Weight &R) { 167 return L.TargetNode < R.TargetNode; 168 }); 169 170 // Combine adjacent edges. 171 WeightList::iterator O = Weights.begin(); 172 for (WeightList::const_iterator I = O, L = O, E = Weights.end(); I != E; 173 ++O, (I = L)) { 174 *O = *I; 175 176 // Find the adjacent weights to the same node. 177 for (++L; L != E && I->TargetNode == L->TargetNode; ++L) 178 combineWeight(*O, *L); 179 } 180 181 // Erase extra entries. 182 Weights.erase(O, Weights.end()); 183 } 184 185 static void combineWeightsByHashing(WeightList &Weights) { 186 // Collect weights into a DenseMap. 187 using HashTable = DenseMap<BlockNode::IndexType, Weight>; 188 189 HashTable Combined(NextPowerOf2(2 * Weights.size())); 190 for (const Weight &W : Weights) 191 combineWeight(Combined[W.TargetNode.Index], W); 192 193 // Check whether anything changed. 194 if (Weights.size() == Combined.size()) 195 return; 196 197 // Fill in the new weights. 198 Weights.clear(); 199 Weights.reserve(Combined.size()); 200 for (const auto &I : Combined) 201 Weights.push_back(I.second); 202 } 203 204 static void combineWeights(WeightList &Weights) { 205 // Use a hash table for many successors to keep this linear. 206 if (Weights.size() > 128) { 207 combineWeightsByHashing(Weights); 208 return; 209 } 210 211 combineWeightsBySorting(Weights); 212 } 213 214 static uint64_t shiftRightAndRound(uint64_t N, int Shift) { 215 assert(Shift >= 0); 216 assert(Shift < 64); 217 if (!Shift) 218 return N; 219 return (N >> Shift) + (UINT64_C(1) & N >> (Shift - 1)); 220 } 221 222 void Distribution::normalize() { 223 // Early exit for termination nodes. 224 if (Weights.empty()) 225 return; 226 227 // Only bother if there are multiple successors. 228 if (Weights.size() > 1) 229 combineWeights(Weights); 230 231 // Early exit when combined into a single successor. 232 if (Weights.size() == 1) { 233 Total = 1; 234 Weights.front().Amount = 1; 235 return; 236 } 237 238 // Determine how much to shift right so that the total fits into 32-bits. 239 // 240 // If we shift at all, shift by 1 extra. Otherwise, the lower limit of 1 241 // for each weight can cause a 32-bit overflow. 242 int Shift = 0; 243 if (DidOverflow) 244 Shift = 33; 245 else if (Total > UINT32_MAX) 246 Shift = 33 - countLeadingZeros(Total); 247 248 // Early exit if nothing needs to be scaled. 249 if (!Shift) { 250 // If we didn't overflow then combineWeights() shouldn't have changed the 251 // sum of the weights, but let's double-check. 252 assert(Total == std::accumulate(Weights.begin(), Weights.end(), UINT64_C(0), 253 [](uint64_t Sum, const Weight &W) { 254 return Sum + W.Amount; 255 }) && 256 "Expected total to be correct"); 257 return; 258 } 259 260 // Recompute the total through accumulation (rather than shifting it) so that 261 // it's accurate after shifting and any changes combineWeights() made above. 262 Total = 0; 263 264 // Sum the weights to each node and shift right if necessary. 265 for (Weight &W : Weights) { 266 // Scale down below UINT32_MAX. Since Shift is larger than necessary, we 267 // can round here without concern about overflow. 268 assert(W.TargetNode.isValid()); 269 W.Amount = std::max(UINT64_C(1), shiftRightAndRound(W.Amount, Shift)); 270 assert(W.Amount <= UINT32_MAX); 271 272 // Update the total. 273 Total += W.Amount; 274 } 275 assert(Total <= UINT32_MAX); 276 } 277 278 void BlockFrequencyInfoImplBase::clear() { 279 // Swap with a default-constructed std::vector, since std::vector<>::clear() 280 // does not actually clear heap storage. 281 std::vector<FrequencyData>().swap(Freqs); 282 IsIrrLoopHeader.clear(); 283 std::vector<WorkingData>().swap(Working); 284 Loops.clear(); 285 } 286 287 /// Clear all memory not needed downstream. 288 /// 289 /// Releases all memory not used downstream. In particular, saves Freqs. 290 static void cleanup(BlockFrequencyInfoImplBase &BFI) { 291 std::vector<FrequencyData> SavedFreqs(std::move(BFI.Freqs)); 292 SparseBitVector<> SavedIsIrrLoopHeader(std::move(BFI.IsIrrLoopHeader)); 293 BFI.clear(); 294 BFI.Freqs = std::move(SavedFreqs); 295 BFI.IsIrrLoopHeader = std::move(SavedIsIrrLoopHeader); 296 } 297 298 bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist, 299 const LoopData *OuterLoop, 300 const BlockNode &Pred, 301 const BlockNode &Succ, 302 uint64_t Weight) { 303 if (!Weight) 304 Weight = 1; 305 306 auto isLoopHeader = [&OuterLoop](const BlockNode &Node) { 307 return OuterLoop && OuterLoop->isHeader(Node); 308 }; 309 310 BlockNode Resolved = Working[Succ.Index].getResolvedNode(); 311 312 #ifndef NDEBUG 313 auto debugSuccessor = [&](const char *Type) { 314 dbgs() << " =>" 315 << " [" << Type << "] weight = " << Weight; 316 if (!isLoopHeader(Resolved)) 317 dbgs() << ", succ = " << getBlockName(Succ); 318 if (Resolved != Succ) 319 dbgs() << ", resolved = " << getBlockName(Resolved); 320 dbgs() << "\n"; 321 }; 322 (void)debugSuccessor; 323 #endif 324 325 if (isLoopHeader(Resolved)) { 326 LLVM_DEBUG(debugSuccessor("backedge")); 327 Dist.addBackedge(Resolved, Weight); 328 return true; 329 } 330 331 if (Working[Resolved.Index].getContainingLoop() != OuterLoop) { 332 LLVM_DEBUG(debugSuccessor(" exit ")); 333 Dist.addExit(Resolved, Weight); 334 return true; 335 } 336 337 if (Resolved < Pred) { 338 if (!isLoopHeader(Pred)) { 339 // If OuterLoop is an irreducible loop, we can't actually handle this. 340 assert((!OuterLoop || !OuterLoop->isIrreducible()) && 341 "unhandled irreducible control flow"); 342 343 // Irreducible backedge. Abort. 344 LLVM_DEBUG(debugSuccessor("abort!!!")); 345 return false; 346 } 347 348 // If "Pred" is a loop header, then this isn't really a backedge; rather, 349 // OuterLoop must be irreducible. These false backedges can come only from 350 // secondary loop headers. 351 assert(OuterLoop && OuterLoop->isIrreducible() && !isLoopHeader(Resolved) && 352 "unhandled irreducible control flow"); 353 } 354 355 LLVM_DEBUG(debugSuccessor(" local ")); 356 Dist.addLocal(Resolved, Weight); 357 return true; 358 } 359 360 bool BlockFrequencyInfoImplBase::addLoopSuccessorsToDist( 361 const LoopData *OuterLoop, LoopData &Loop, Distribution &Dist) { 362 // Copy the exit map into Dist. 363 for (const auto &I : Loop.Exits) 364 if (!addToDist(Dist, OuterLoop, Loop.getHeader(), I.first, 365 I.second.getMass())) 366 // Irreducible backedge. 367 return false; 368 369 return true; 370 } 371 372 /// Compute the loop scale for a loop. 373 void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) { 374 // Compute loop scale. 375 LLVM_DEBUG(dbgs() << "compute-loop-scale: " << getLoopName(Loop) << "\n"); 376 377 // Infinite loops need special handling. If we give the back edge an infinite 378 // mass, they may saturate all the other scales in the function down to 1, 379 // making all the other region temperatures look exactly the same. Choose an 380 // arbitrary scale to avoid these issues. 381 // 382 // FIXME: An alternate way would be to select a symbolic scale which is later 383 // replaced to be the maximum of all computed scales plus 1. This would 384 // appropriately describe the loop as having a large scale, without skewing 385 // the final frequency computation. 386 const Scaled64 InfiniteLoopScale(1, 12); 387 388 // LoopScale == 1 / ExitMass 389 // ExitMass == HeadMass - BackedgeMass 390 BlockMass TotalBackedgeMass; 391 for (auto &Mass : Loop.BackedgeMass) 392 TotalBackedgeMass += Mass; 393 BlockMass ExitMass = BlockMass::getFull() - TotalBackedgeMass; 394 395 // Block scale stores the inverse of the scale. If this is an infinite loop, 396 // its exit mass will be zero. In this case, use an arbitrary scale for the 397 // loop scale. 398 Loop.Scale = 399 ExitMass.isEmpty() ? InfiniteLoopScale : ExitMass.toScaled().inverse(); 400 401 LLVM_DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" 402 << BlockMass::getFull() << " - " << TotalBackedgeMass 403 << ")\n" 404 << " - scale = " << Loop.Scale << "\n"); 405 } 406 407 /// Package up a loop. 408 void BlockFrequencyInfoImplBase::packageLoop(LoopData &Loop) { 409 LLVM_DEBUG(dbgs() << "packaging-loop: " << getLoopName(Loop) << "\n"); 410 411 // Clear the subloop exits to prevent quadratic memory usage. 412 for (const BlockNode &M : Loop.Nodes) { 413 if (auto *Loop = Working[M.Index].getPackagedLoop()) 414 Loop->Exits.clear(); 415 LLVM_DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n"); 416 } 417 Loop.IsPackaged = true; 418 } 419 420 #ifndef NDEBUG 421 static void debugAssign(const BlockFrequencyInfoImplBase &BFI, 422 const DitheringDistributer &D, const BlockNode &T, 423 const BlockMass &M, const char *Desc) { 424 dbgs() << " => assign " << M << " (" << D.RemMass << ")"; 425 if (Desc) 426 dbgs() << " [" << Desc << "]"; 427 if (T.isValid()) 428 dbgs() << " to " << BFI.getBlockName(T); 429 dbgs() << "\n"; 430 } 431 #endif 432 433 void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source, 434 LoopData *OuterLoop, 435 Distribution &Dist) { 436 BlockMass Mass = Working[Source.Index].getMass(); 437 LLVM_DEBUG(dbgs() << " => mass: " << Mass << "\n"); 438 439 // Distribute mass to successors as laid out in Dist. 440 DitheringDistributer D(Dist, Mass); 441 442 for (const Weight &W : Dist.Weights) { 443 // Check for a local edge (non-backedge and non-exit). 444 BlockMass Taken = D.takeMass(W.Amount); 445 if (W.Type == Weight::Local) { 446 Working[W.TargetNode.Index].getMass() += Taken; 447 LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); 448 continue; 449 } 450 451 // Backedges and exits only make sense if we're processing a loop. 452 assert(OuterLoop && "backedge or exit outside of loop"); 453 454 // Check for a backedge. 455 if (W.Type == Weight::Backedge) { 456 OuterLoop->BackedgeMass[OuterLoop->getHeaderIndex(W.TargetNode)] += Taken; 457 LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "back")); 458 continue; 459 } 460 461 // This must be an exit. 462 assert(W.Type == Weight::Exit); 463 OuterLoop->Exits.push_back(std::make_pair(W.TargetNode, Taken)); 464 LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "exit")); 465 } 466 } 467 468 static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI, 469 const Scaled64 &Min, const Scaled64 &Max) { 470 // Scale the Factor to a size that creates integers. Ideally, integers would 471 // be scaled so that Max == UINT64_MAX so that they can be best 472 // differentiated. However, in the presence of large frequency values, small 473 // frequencies are scaled down to 1, making it impossible to differentiate 474 // small, unequal numbers. When the spread between Min and Max frequencies 475 // fits well within MaxBits, we make the scale be at least 8. 476 const unsigned MaxBits = 64; 477 const unsigned SpreadBits = (Max / Min).lg(); 478 Scaled64 ScalingFactor; 479 if (SpreadBits <= MaxBits - 3) { 480 // If the values are small enough, make the scaling factor at least 8 to 481 // allow distinguishing small values. 482 ScalingFactor = Min.inverse(); 483 ScalingFactor <<= 3; 484 } else { 485 // If the values need more than MaxBits to be represented, saturate small 486 // frequency values down to 1 by using a scaling factor that benefits large 487 // frequency values. 488 ScalingFactor = Scaled64(1, MaxBits) / Max; 489 } 490 491 // Translate the floats to integers. 492 LLVM_DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max 493 << ", factor = " << ScalingFactor << "\n"); 494 for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) { 495 Scaled64 Scaled = BFI.Freqs[Index].Scaled * ScalingFactor; 496 BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt<uint64_t>()); 497 LLVM_DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = " 498 << BFI.Freqs[Index].Scaled << ", scaled = " << Scaled 499 << ", int = " << BFI.Freqs[Index].Integer << "\n"); 500 } 501 } 502 503 /// Unwrap a loop package. 504 /// 505 /// Visits all the members of a loop, adjusting their BlockData according to 506 /// the loop's pseudo-node. 507 static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) { 508 LLVM_DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getLoopName(Loop) 509 << ": mass = " << Loop.Mass << ", scale = " << Loop.Scale 510 << "\n"); 511 Loop.Scale *= Loop.Mass.toScaled(); 512 Loop.IsPackaged = false; 513 LLVM_DEBUG(dbgs() << " => combined-scale = " << Loop.Scale << "\n"); 514 515 // Propagate the head scale through the loop. Since members are visited in 516 // RPO, the head scale will be updated by the loop scale first, and then the 517 // final head scale will be used for updated the rest of the members. 518 for (const BlockNode &N : Loop.Nodes) { 519 const auto &Working = BFI.Working[N.Index]; 520 Scaled64 &F = Working.isAPackage() ? Working.getPackagedLoop()->Scale 521 : BFI.Freqs[N.Index].Scaled; 522 Scaled64 New = Loop.Scale * F; 523 LLVM_DEBUG(dbgs() << " - " << BFI.getBlockName(N) << ": " << F << " => " 524 << New << "\n"); 525 F = New; 526 } 527 } 528 529 void BlockFrequencyInfoImplBase::unwrapLoops() { 530 // Set initial frequencies from loop-local masses. 531 for (size_t Index = 0; Index < Working.size(); ++Index) 532 Freqs[Index].Scaled = Working[Index].Mass.toScaled(); 533 534 for (LoopData &Loop : Loops) 535 unwrapLoop(*this, Loop); 536 } 537 538 void BlockFrequencyInfoImplBase::finalizeMetrics() { 539 // Unwrap loop packages in reverse post-order, tracking min and max 540 // frequencies. 541 auto Min = Scaled64::getLargest(); 542 auto Max = Scaled64::getZero(); 543 for (size_t Index = 0; Index < Working.size(); ++Index) { 544 // Update min/max scale. 545 Min = std::min(Min, Freqs[Index].Scaled); 546 Max = std::max(Max, Freqs[Index].Scaled); 547 } 548 549 // Convert to integers. 550 convertFloatingToInteger(*this, Min, Max); 551 552 // Clean up data structures. 553 cleanup(*this); 554 555 // Print out the final stats. 556 LLVM_DEBUG(dump()); 557 } 558 559 BlockFrequency 560 BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const { 561 if (!Node.isValid()) { 562 #ifndef NDEBUG 563 if (CheckBFIUnknownBlockQueries) { 564 SmallString<256> Msg; 565 raw_svector_ostream OS(Msg); 566 OS << "*** Detected BFI query for unknown block " << getBlockName(Node); 567 report_fatal_error(OS.str()); 568 } 569 #endif 570 return 0; 571 } 572 return Freqs[Node.Index].Integer; 573 } 574 575 Optional<uint64_t> 576 BlockFrequencyInfoImplBase::getBlockProfileCount(const Function &F, 577 const BlockNode &Node, 578 bool AllowSynthetic) const { 579 return getProfileCountFromFreq(F, getBlockFreq(Node).getFrequency(), 580 AllowSynthetic); 581 } 582 583 Optional<uint64_t> 584 BlockFrequencyInfoImplBase::getProfileCountFromFreq(const Function &F, 585 uint64_t Freq, 586 bool AllowSynthetic) const { 587 auto EntryCount = F.getEntryCount(AllowSynthetic); 588 if (!EntryCount) 589 return None; 590 // Use 128 bit APInt to do the arithmetic to avoid overflow. 591 APInt BlockCount(128, EntryCount.getCount()); 592 APInt BlockFreq(128, Freq); 593 APInt EntryFreq(128, getEntryFreq()); 594 BlockCount *= BlockFreq; 595 // Rounded division of BlockCount by EntryFreq. Since EntryFreq is unsigned 596 // lshr by 1 gives EntryFreq/2. 597 BlockCount = (BlockCount + EntryFreq.lshr(1)).udiv(EntryFreq); 598 return BlockCount.getLimitedValue(); 599 } 600 601 bool 602 BlockFrequencyInfoImplBase::isIrrLoopHeader(const BlockNode &Node) { 603 if (!Node.isValid()) 604 return false; 605 return IsIrrLoopHeader.test(Node.Index); 606 } 607 608 Scaled64 609 BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const { 610 if (!Node.isValid()) 611 return Scaled64::getZero(); 612 return Freqs[Node.Index].Scaled; 613 } 614 615 void BlockFrequencyInfoImplBase::setBlockFreq(const BlockNode &Node, 616 uint64_t Freq) { 617 assert(Node.isValid() && "Expected valid node"); 618 assert(Node.Index < Freqs.size() && "Expected legal index"); 619 Freqs[Node.Index].Integer = Freq; 620 } 621 622 std::string 623 BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const { 624 return {}; 625 } 626 627 std::string 628 BlockFrequencyInfoImplBase::getLoopName(const LoopData &Loop) const { 629 return getBlockName(Loop.getHeader()) + (Loop.isIrreducible() ? "**" : "*"); 630 } 631 632 raw_ostream & 633 BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS, 634 const BlockNode &Node) const { 635 return OS << getFloatingBlockFreq(Node); 636 } 637 638 raw_ostream & 639 BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS, 640 const BlockFrequency &Freq) const { 641 Scaled64 Block(Freq.getFrequency(), 0); 642 Scaled64 Entry(getEntryFreq(), 0); 643 644 return OS << Block / Entry; 645 } 646 647 void IrreducibleGraph::addNodesInLoop(const BFIBase::LoopData &OuterLoop) { 648 Start = OuterLoop.getHeader(); 649 Nodes.reserve(OuterLoop.Nodes.size()); 650 for (auto N : OuterLoop.Nodes) 651 addNode(N); 652 indexNodes(); 653 } 654 655 void IrreducibleGraph::addNodesInFunction() { 656 Start = 0; 657 for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index) 658 if (!BFI.Working[Index].isPackaged()) 659 addNode(Index); 660 indexNodes(); 661 } 662 663 void IrreducibleGraph::indexNodes() { 664 for (auto &I : Nodes) 665 Lookup[I.Node.Index] = &I; 666 } 667 668 void IrreducibleGraph::addEdge(IrrNode &Irr, const BlockNode &Succ, 669 const BFIBase::LoopData *OuterLoop) { 670 if (OuterLoop && OuterLoop->isHeader(Succ)) 671 return; 672 auto L = Lookup.find(Succ.Index); 673 if (L == Lookup.end()) 674 return; 675 IrrNode &SuccIrr = *L->second; 676 Irr.Edges.push_back(&SuccIrr); 677 SuccIrr.Edges.push_front(&Irr); 678 ++SuccIrr.NumIn; 679 } 680 681 namespace llvm { 682 683 template <> struct GraphTraits<IrreducibleGraph> { 684 using GraphT = bfi_detail::IrreducibleGraph; 685 using NodeRef = const GraphT::IrrNode *; 686 using ChildIteratorType = GraphT::IrrNode::iterator; 687 688 static NodeRef getEntryNode(const GraphT &G) { return G.StartIrr; } 689 static ChildIteratorType child_begin(NodeRef N) { return N->succ_begin(); } 690 static ChildIteratorType child_end(NodeRef N) { return N->succ_end(); } 691 }; 692 693 } // end namespace llvm 694 695 /// Find extra irreducible headers. 696 /// 697 /// Find entry blocks and other blocks with backedges, which exist when \c G 698 /// contains irreducible sub-SCCs. 699 static void findIrreducibleHeaders( 700 const BlockFrequencyInfoImplBase &BFI, 701 const IrreducibleGraph &G, 702 const std::vector<const IrreducibleGraph::IrrNode *> &SCC, 703 LoopData::NodeList &Headers, LoopData::NodeList &Others) { 704 // Map from nodes in the SCC to whether it's an entry block. 705 SmallDenseMap<const IrreducibleGraph::IrrNode *, bool, 8> InSCC; 706 707 // InSCC also acts the set of nodes in the graph. Seed it. 708 for (const auto *I : SCC) 709 InSCC[I] = false; 710 711 for (auto I = InSCC.begin(), E = InSCC.end(); I != E; ++I) { 712 auto &Irr = *I->first; 713 for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) { 714 if (InSCC.count(P)) 715 continue; 716 717 // This is an entry block. 718 I->second = true; 719 Headers.push_back(Irr.Node); 720 LLVM_DEBUG(dbgs() << " => entry = " << BFI.getBlockName(Irr.Node) 721 << "\n"); 722 break; 723 } 724 } 725 assert(Headers.size() >= 2 && 726 "Expected irreducible CFG; -loop-info is likely invalid"); 727 if (Headers.size() == InSCC.size()) { 728 // Every block is a header. 729 llvm::sort(Headers); 730 return; 731 } 732 733 // Look for extra headers from irreducible sub-SCCs. 734 for (const auto &I : InSCC) { 735 // Entry blocks are already headers. 736 if (I.second) 737 continue; 738 739 auto &Irr = *I.first; 740 for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) { 741 // Skip forward edges. 742 if (P->Node < Irr.Node) 743 continue; 744 745 // Skip predecessors from entry blocks. These can have inverted 746 // ordering. 747 if (InSCC.lookup(P)) 748 continue; 749 750 // Store the extra header. 751 Headers.push_back(Irr.Node); 752 LLVM_DEBUG(dbgs() << " => extra = " << BFI.getBlockName(Irr.Node) 753 << "\n"); 754 break; 755 } 756 if (Headers.back() == Irr.Node) 757 // Added this as a header. 758 continue; 759 760 // This is not a header. 761 Others.push_back(Irr.Node); 762 LLVM_DEBUG(dbgs() << " => other = " << BFI.getBlockName(Irr.Node) << "\n"); 763 } 764 llvm::sort(Headers); 765 llvm::sort(Others); 766 } 767 768 static void createIrreducibleLoop( 769 BlockFrequencyInfoImplBase &BFI, const IrreducibleGraph &G, 770 LoopData *OuterLoop, std::list<LoopData>::iterator Insert, 771 const std::vector<const IrreducibleGraph::IrrNode *> &SCC) { 772 // Translate the SCC into RPO. 773 LLVM_DEBUG(dbgs() << " - found-scc\n"); 774 775 LoopData::NodeList Headers; 776 LoopData::NodeList Others; 777 findIrreducibleHeaders(BFI, G, SCC, Headers, Others); 778 779 auto Loop = BFI.Loops.emplace(Insert, OuterLoop, Headers.begin(), 780 Headers.end(), Others.begin(), Others.end()); 781 782 // Update loop hierarchy. 783 for (const auto &N : Loop->Nodes) 784 if (BFI.Working[N.Index].isLoopHeader()) 785 BFI.Working[N.Index].Loop->Parent = &*Loop; 786 else 787 BFI.Working[N.Index].Loop = &*Loop; 788 } 789 790 iterator_range<std::list<LoopData>::iterator> 791 BlockFrequencyInfoImplBase::analyzeIrreducible( 792 const IrreducibleGraph &G, LoopData *OuterLoop, 793 std::list<LoopData>::iterator Insert) { 794 assert((OuterLoop == nullptr) == (Insert == Loops.begin())); 795 auto Prev = OuterLoop ? std::prev(Insert) : Loops.end(); 796 797 for (auto I = scc_begin(G); !I.isAtEnd(); ++I) { 798 if (I->size() < 2) 799 continue; 800 801 // Translate the SCC into RPO. 802 createIrreducibleLoop(*this, G, OuterLoop, Insert, *I); 803 } 804 805 if (OuterLoop) 806 return make_range(std::next(Prev), Insert); 807 return make_range(Loops.begin(), Insert); 808 } 809 810 void 811 BlockFrequencyInfoImplBase::updateLoopWithIrreducible(LoopData &OuterLoop) { 812 OuterLoop.Exits.clear(); 813 for (auto &Mass : OuterLoop.BackedgeMass) 814 Mass = BlockMass::getEmpty(); 815 auto O = OuterLoop.Nodes.begin() + 1; 816 for (auto I = O, E = OuterLoop.Nodes.end(); I != E; ++I) 817 if (!Working[I->Index].isPackaged()) 818 *O++ = *I; 819 OuterLoop.Nodes.erase(O, OuterLoop.Nodes.end()); 820 } 821 822 void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) { 823 assert(Loop.isIrreducible() && "this only makes sense on irreducible loops"); 824 825 // Since the loop has more than one header block, the mass flowing back into 826 // each header will be different. Adjust the mass in each header loop to 827 // reflect the masses flowing through back edges. 828 // 829 // To do this, we distribute the initial mass using the backedge masses 830 // as weights for the distribution. 831 BlockMass LoopMass = BlockMass::getFull(); 832 Distribution Dist; 833 834 LLVM_DEBUG(dbgs() << "adjust-loop-header-mass:\n"); 835 for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { 836 auto &HeaderNode = Loop.Nodes[H]; 837 auto &BackedgeMass = Loop.BackedgeMass[Loop.getHeaderIndex(HeaderNode)]; 838 LLVM_DEBUG(dbgs() << " - Add back edge mass for node " 839 << getBlockName(HeaderNode) << ": " << BackedgeMass 840 << "\n"); 841 if (BackedgeMass.getMass() > 0) 842 Dist.addLocal(HeaderNode, BackedgeMass.getMass()); 843 else 844 LLVM_DEBUG(dbgs() << " Nothing added. Back edge mass is zero\n"); 845 } 846 847 DitheringDistributer D(Dist, LoopMass); 848 849 LLVM_DEBUG(dbgs() << " Distribute loop mass " << LoopMass 850 << " to headers using above weights\n"); 851 for (const Weight &W : Dist.Weights) { 852 BlockMass Taken = D.takeMass(W.Amount); 853 assert(W.Type == Weight::Local && "all weights should be local"); 854 Working[W.TargetNode.Index].getMass() = Taken; 855 LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); 856 } 857 } 858 859 void BlockFrequencyInfoImplBase::distributeIrrLoopHeaderMass(Distribution &Dist) { 860 BlockMass LoopMass = BlockMass::getFull(); 861 DitheringDistributer D(Dist, LoopMass); 862 for (const Weight &W : Dist.Weights) { 863 BlockMass Taken = D.takeMass(W.Amount); 864 assert(W.Type == Weight::Local && "all weights should be local"); 865 Working[W.TargetNode.Index].getMass() = Taken; 866 LLVM_DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); 867 } 868 } 869