1 //===-- SchedClassResolution.cpp --------------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "SchedClassResolution.h" 10 #include "BenchmarkResult.h" 11 #include "llvm/ADT/STLExtras.h" 12 #include "llvm/MC/MCAsmInfo.h" 13 #include "llvm/MCA/Support.h" 14 #include "llvm/Support/FormatVariadic.h" 15 #include <limits> 16 #include <unordered_set> 17 #include <vector> 18 19 namespace llvm { 20 namespace exegesis { 21 22 // Return the non-redundant list of WriteProcRes used by the given sched class. 23 // The scheduling model for LLVM is such that each instruction has a certain 24 // number of uops which consume resources which are described by WriteProcRes 25 // entries. Each entry describe how many cycles are spent on a specific ProcRes 26 // kind. 27 // For example, an instruction might have 3 uOps, one dispatching on P0 28 // (ProcResIdx=1) and two on P06 (ProcResIdx = 7). 29 // Note that LLVM additionally denormalizes resource consumption to include 30 // usage of super resources by subresources. So in practice if there exists a 31 // P016 (ProcResIdx=10), then the cycles consumed by P0 are also consumed by 32 // P06 (ProcResIdx = 7) and P016 (ProcResIdx = 10), and the resources consumed 33 // by P06 are also consumed by P016. In the figure below, parenthesized cycles 34 // denote implied usage of superresources by subresources: 35 // P0 P06 P016 36 // uOp1 1 (1) (1) 37 // uOp2 1 (1) 38 // uOp3 1 (1) 39 // ============================= 40 // 1 3 3 41 // Eventually we end up with three entries for the WriteProcRes of the 42 // instruction: 43 // {ProcResIdx=1, Cycles=1} // P0 44 // {ProcResIdx=7, Cycles=3} // P06 45 // {ProcResIdx=10, Cycles=3} // P016 46 // 47 // Note that in this case, P016 does not contribute any cycles, so it would 48 // be removed by this function. 49 // FIXME: Merge this with the equivalent in llvm-mca. 50 static SmallVector<MCWriteProcResEntry, 8> 51 getNonRedundantWriteProcRes(const MCSchedClassDesc &SCDesc, 52 const MCSubtargetInfo &STI) { 53 SmallVector<MCWriteProcResEntry, 8> Result; 54 const auto &SM = STI.getSchedModel(); 55 const unsigned NumProcRes = SM.getNumProcResourceKinds(); 56 57 // Collect resource masks. 58 SmallVector<uint64_t> ProcResourceMasks(NumProcRes); 59 mca::computeProcResourceMasks(SM, ProcResourceMasks); 60 61 // Sort entries by smaller resources for (basic) topological ordering. 62 using ResourceMaskAndEntry = std::pair<uint64_t, const MCWriteProcResEntry *>; 63 SmallVector<ResourceMaskAndEntry, 8> ResourceMaskAndEntries; 64 for (const auto *WPR = STI.getWriteProcResBegin(&SCDesc), 65 *const WPREnd = STI.getWriteProcResEnd(&SCDesc); 66 WPR != WPREnd; ++WPR) { 67 uint64_t Mask = ProcResourceMasks[WPR->ProcResourceIdx]; 68 ResourceMaskAndEntries.push_back({Mask, WPR}); 69 } 70 sort(ResourceMaskAndEntries, 71 [](const ResourceMaskAndEntry &A, const ResourceMaskAndEntry &B) { 72 unsigned popcntA = llvm::popcount(A.first); 73 unsigned popcntB = llvm::popcount(B.first); 74 if (popcntA < popcntB) 75 return true; 76 if (popcntA > popcntB) 77 return false; 78 return A.first < B.first; 79 }); 80 81 SmallVector<float, 32> ProcResUnitUsage(NumProcRes); 82 for (const ResourceMaskAndEntry &Entry : ResourceMaskAndEntries) { 83 const MCWriteProcResEntry *WPR = Entry.second; 84 const MCProcResourceDesc *const ProcResDesc = 85 SM.getProcResource(WPR->ProcResourceIdx); 86 // TODO: Handle StartAtCycle in llvm-exegesis and llvm-mca. See 87 // https://github.com/llvm/llvm-project/issues/62680 and 88 // https://github.com/llvm/llvm-project/issues/62681 89 assert(WPR->StartAtCycle == 0 && 90 "`llvm-exegesis` does not handle StartAtCycle > 0"); 91 if (ProcResDesc->SubUnitsIdxBegin == nullptr) { 92 // This is a ProcResUnit. 93 Result.push_back({WPR->ProcResourceIdx, WPR->Cycles, WPR->StartAtCycle}); 94 ProcResUnitUsage[WPR->ProcResourceIdx] += WPR->Cycles; 95 } else { 96 // This is a ProcResGroup. First see if it contributes any cycles or if 97 // it has cycles just from subunits. 98 float RemainingCycles = WPR->Cycles; 99 for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin; 100 SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits; 101 ++SubResIdx) { 102 RemainingCycles -= ProcResUnitUsage[*SubResIdx]; 103 } 104 if (RemainingCycles < 0.01f) { 105 // The ProcResGroup contributes no cycles of its own. 106 continue; 107 } 108 // The ProcResGroup contributes `RemainingCycles` cycles of its own. 109 Result.push_back({WPR->ProcResourceIdx, 110 static_cast<uint16_t>(std::round(RemainingCycles)), 111 WPR->StartAtCycle}); 112 // Spread the remaining cycles over all subunits. 113 for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin; 114 SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits; 115 ++SubResIdx) { 116 ProcResUnitUsage[*SubResIdx] += RemainingCycles / ProcResDesc->NumUnits; 117 } 118 } 119 } 120 return Result; 121 } 122 123 // Distributes a pressure budget as evenly as possible on the provided subunits 124 // given the already existing port pressure distribution. 125 // 126 // The algorithm is as follows: while there is remaining pressure to 127 // distribute, find the subunits with minimal pressure, and distribute 128 // remaining pressure equally up to the pressure of the unit with 129 // second-to-minimal pressure. 130 // For example, let's assume we want to distribute 2*P1256 131 // (Subunits = [P1,P2,P5,P6]), and the starting DensePressure is: 132 // DensePressure = P0 P1 P2 P3 P4 P5 P6 P7 133 // 0.1 0.3 0.2 0.0 0.0 0.5 0.5 0.5 134 // RemainingPressure = 2.0 135 // We sort the subunits by pressure: 136 // Subunits = [(P2,p=0.2), (P1,p=0.3), (P5,p=0.5), (P6, p=0.5)] 137 // We'll first start by the subunits with minimal pressure, which are at 138 // the beginning of the sorted array. In this example there is one (P2). 139 // The subunit with second-to-minimal pressure is the next one in the 140 // array (P1). So we distribute 0.1 pressure to P2, and remove 0.1 cycles 141 // from the budget. 142 // Subunits = [(P2,p=0.3), (P1,p=0.3), (P5,p=0.5), (P5,p=0.5)] 143 // RemainingPressure = 1.9 144 // We repeat this process: distribute 0.2 pressure on each of the minimal 145 // P2 and P1, decrease budget by 2*0.2: 146 // Subunits = [(P2,p=0.5), (P1,p=0.5), (P5,p=0.5), (P5,p=0.5)] 147 // RemainingPressure = 1.5 148 // There are no second-to-minimal subunits so we just share the remaining 149 // budget (1.5 cycles) equally: 150 // Subunits = [(P2,p=0.875), (P1,p=0.875), (P5,p=0.875), (P5,p=0.875)] 151 // RemainingPressure = 0.0 152 // We stop as there is no remaining budget to distribute. 153 static void distributePressure(float RemainingPressure, 154 SmallVector<uint16_t, 32> Subunits, 155 SmallVector<float, 32> &DensePressure) { 156 // Find the number of subunits with minimal pressure (they are at the 157 // front). 158 sort(Subunits, [&DensePressure](const uint16_t A, const uint16_t B) { 159 return DensePressure[A] < DensePressure[B]; 160 }); 161 const auto getPressureForSubunit = [&DensePressure, 162 &Subunits](size_t I) -> float & { 163 return DensePressure[Subunits[I]]; 164 }; 165 size_t NumMinimalSU = 1; 166 while (NumMinimalSU < Subunits.size() && 167 getPressureForSubunit(NumMinimalSU) == getPressureForSubunit(0)) { 168 ++NumMinimalSU; 169 } 170 while (RemainingPressure > 0.0f) { 171 if (NumMinimalSU == Subunits.size()) { 172 // All units are minimal, just distribute evenly and be done. 173 for (size_t I = 0; I < NumMinimalSU; ++I) { 174 getPressureForSubunit(I) += RemainingPressure / NumMinimalSU; 175 } 176 return; 177 } 178 // Distribute the remaining pressure equally. 179 const float MinimalPressure = getPressureForSubunit(NumMinimalSU - 1); 180 const float SecondToMinimalPressure = getPressureForSubunit(NumMinimalSU); 181 assert(MinimalPressure < SecondToMinimalPressure); 182 const float Increment = SecondToMinimalPressure - MinimalPressure; 183 if (RemainingPressure <= NumMinimalSU * Increment) { 184 // There is not enough remaining pressure. 185 for (size_t I = 0; I < NumMinimalSU; ++I) { 186 getPressureForSubunit(I) += RemainingPressure / NumMinimalSU; 187 } 188 return; 189 } 190 // Bump all minimal pressure subunits to `SecondToMinimalPressure`. 191 for (size_t I = 0; I < NumMinimalSU; ++I) { 192 getPressureForSubunit(I) = SecondToMinimalPressure; 193 RemainingPressure -= SecondToMinimalPressure; 194 } 195 while (NumMinimalSU < Subunits.size() && 196 getPressureForSubunit(NumMinimalSU) == SecondToMinimalPressure) { 197 ++NumMinimalSU; 198 } 199 } 200 } 201 202 std::vector<std::pair<uint16_t, float>> 203 computeIdealizedProcResPressure(const MCSchedModel &SM, 204 SmallVector<MCWriteProcResEntry, 8> WPRS) { 205 // DensePressure[I] is the port pressure for Proc Resource I. 206 SmallVector<float, 32> DensePressure(SM.getNumProcResourceKinds()); 207 sort(WPRS, [](const MCWriteProcResEntry &A, const MCWriteProcResEntry &B) { 208 return A.ProcResourceIdx < B.ProcResourceIdx; 209 }); 210 for (const MCWriteProcResEntry &WPR : WPRS) { 211 // Get units for the entry. 212 const MCProcResourceDesc *const ProcResDesc = 213 SM.getProcResource(WPR.ProcResourceIdx); 214 if (ProcResDesc->SubUnitsIdxBegin == nullptr) { 215 // This is a ProcResUnit. 216 DensePressure[WPR.ProcResourceIdx] += WPR.Cycles; 217 } else { 218 // This is a ProcResGroup. 219 SmallVector<uint16_t, 32> Subunits(ProcResDesc->SubUnitsIdxBegin, 220 ProcResDesc->SubUnitsIdxBegin + 221 ProcResDesc->NumUnits); 222 distributePressure(WPR.Cycles, Subunits, DensePressure); 223 } 224 } 225 // Turn dense pressure into sparse pressure by removing zero entries. 226 std::vector<std::pair<uint16_t, float>> Pressure; 227 for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) { 228 if (DensePressure[I] > 0.0f) 229 Pressure.emplace_back(I, DensePressure[I]); 230 } 231 return Pressure; 232 } 233 234 ResolvedSchedClass::ResolvedSchedClass(const MCSubtargetInfo &STI, 235 unsigned ResolvedSchedClassId, 236 bool WasVariant) 237 : SchedClassId(ResolvedSchedClassId), 238 SCDesc(STI.getSchedModel().getSchedClassDesc(ResolvedSchedClassId)), 239 WasVariant(WasVariant), 240 NonRedundantWriteProcRes(getNonRedundantWriteProcRes(*SCDesc, STI)), 241 IdealizedProcResPressure(computeIdealizedProcResPressure( 242 STI.getSchedModel(), NonRedundantWriteProcRes)) { 243 assert((SCDesc == nullptr || !SCDesc->isVariant()) && 244 "ResolvedSchedClass should never be variant"); 245 } 246 247 static unsigned ResolveVariantSchedClassId(const MCSubtargetInfo &STI, 248 const MCInstrInfo &InstrInfo, 249 unsigned SchedClassId, 250 const MCInst &MCI) { 251 const auto &SM = STI.getSchedModel(); 252 while (SchedClassId && SM.getSchedClassDesc(SchedClassId)->isVariant()) { 253 SchedClassId = STI.resolveVariantSchedClass(SchedClassId, &MCI, &InstrInfo, 254 SM.getProcessorID()); 255 } 256 return SchedClassId; 257 } 258 259 std::pair<unsigned /*SchedClassId*/, bool /*WasVariant*/> 260 ResolvedSchedClass::resolveSchedClassId(const MCSubtargetInfo &SubtargetInfo, 261 const MCInstrInfo &InstrInfo, 262 const MCInst &MCI) { 263 unsigned SchedClassId = InstrInfo.get(MCI.getOpcode()).getSchedClass(); 264 const bool WasVariant = SchedClassId && SubtargetInfo.getSchedModel() 265 .getSchedClassDesc(SchedClassId) 266 ->isVariant(); 267 SchedClassId = 268 ResolveVariantSchedClassId(SubtargetInfo, InstrInfo, SchedClassId, MCI); 269 return std::make_pair(SchedClassId, WasVariant); 270 } 271 272 // Returns a ProxResIdx by id or name. 273 static unsigned findProcResIdx(const MCSubtargetInfo &STI, 274 const StringRef NameOrId) { 275 // Interpret the key as an ProcResIdx. 276 unsigned ProcResIdx = 0; 277 if (to_integer(NameOrId, ProcResIdx, 10)) 278 return ProcResIdx; 279 // Interpret the key as a ProcRes name. 280 const auto &SchedModel = STI.getSchedModel(); 281 for (int I = 0, E = SchedModel.getNumProcResourceKinds(); I < E; ++I) { 282 if (NameOrId == SchedModel.getProcResource(I)->Name) 283 return I; 284 } 285 return 0; 286 } 287 288 std::vector<BenchmarkMeasure> ResolvedSchedClass::getAsPoint( 289 Benchmark::ModeE Mode, const MCSubtargetInfo &STI, 290 ArrayRef<PerInstructionStats> Representative) const { 291 const size_t NumMeasurements = Representative.size(); 292 293 std::vector<BenchmarkMeasure> SchedClassPoint(NumMeasurements); 294 295 if (Mode == Benchmark::Latency) { 296 assert(NumMeasurements == 1 && "Latency is a single measure."); 297 BenchmarkMeasure &LatencyMeasure = SchedClassPoint[0]; 298 299 // Find the latency. 300 LatencyMeasure.PerInstructionValue = 0.0; 301 302 for (unsigned I = 0; I < SCDesc->NumWriteLatencyEntries; ++I) { 303 const MCWriteLatencyEntry *const WLE = 304 STI.getWriteLatencyEntry(SCDesc, I); 305 LatencyMeasure.PerInstructionValue = 306 std::max<double>(LatencyMeasure.PerInstructionValue, WLE->Cycles); 307 } 308 } else if (Mode == Benchmark::Uops) { 309 for (auto I : zip(SchedClassPoint, Representative)) { 310 BenchmarkMeasure &Measure = std::get<0>(I); 311 const PerInstructionStats &Stats = std::get<1>(I); 312 313 StringRef Key = Stats.key(); 314 uint16_t ProcResIdx = findProcResIdx(STI, Key); 315 if (ProcResIdx > 0) { 316 // Find the pressure on ProcResIdx `Key`. 317 const auto ProcResPressureIt = 318 llvm::find_if(IdealizedProcResPressure, 319 [ProcResIdx](const std::pair<uint16_t, float> &WPR) { 320 return WPR.first == ProcResIdx; 321 }); 322 Measure.PerInstructionValue = 323 ProcResPressureIt == IdealizedProcResPressure.end() 324 ? 0.0 325 : ProcResPressureIt->second; 326 } else if (Key == "NumMicroOps") { 327 Measure.PerInstructionValue = SCDesc->NumMicroOps; 328 } else { 329 errs() << "expected `key` to be either a ProcResIdx or a ProcRes " 330 "name, got " 331 << Key << "\n"; 332 return {}; 333 } 334 } 335 } else if (Mode == Benchmark::InverseThroughput) { 336 assert(NumMeasurements == 1 && "Inverse Throughput is a single measure."); 337 BenchmarkMeasure &RThroughputMeasure = SchedClassPoint[0]; 338 339 RThroughputMeasure.PerInstructionValue = 340 MCSchedModel::getReciprocalThroughput(STI, *SCDesc); 341 } else { 342 llvm_unreachable("unimplemented measurement matching mode"); 343 } 344 345 return SchedClassPoint; 346 } 347 348 } // namespace exegesis 349 } // namespace llvm 350