1 //===-- SchedClassResolution.cpp --------------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "SchedClassResolution.h" 10 #include "BenchmarkResult.h" 11 #include "llvm/ADT/STLExtras.h" 12 #include "llvm/MC/MCAsmInfo.h" 13 #include "llvm/MCA/Support.h" 14 #include "llvm/Support/FormatVariadic.h" 15 #include <limits> 16 #include <unordered_set> 17 #include <vector> 18 19 namespace llvm { 20 namespace exegesis { 21 22 // Return the non-redundant list of WriteProcRes used by the given sched class. 23 // The scheduling model for LLVM is such that each instruction has a certain 24 // number of uops which consume resources which are described by WriteProcRes 25 // entries. Each entry describe how many cycles are spent on a specific ProcRes 26 // kind. 27 // For example, an instruction might have 3 uOps, one dispatching on P0 28 // (ProcResIdx=1) and two on P06 (ProcResIdx = 7). 29 // Note that LLVM additionally denormalizes resource consumption to include 30 // usage of super resources by subresources. So in practice if there exists a 31 // P016 (ProcResIdx=10), then the cycles consumed by P0 are also consumed by 32 // P06 (ProcResIdx = 7) and P016 (ProcResIdx = 10), and the resources consumed 33 // by P06 are also consumed by P016. In the figure below, parenthesized cycles 34 // denote implied usage of superresources by subresources: 35 // P0 P06 P016 36 // uOp1 1 (1) (1) 37 // uOp2 1 (1) 38 // uOp3 1 (1) 39 // ============================= 40 // 1 3 3 41 // Eventually we end up with three entries for the WriteProcRes of the 42 // instruction: 43 // {ProcResIdx=1, Cycles=1} // P0 44 // {ProcResIdx=7, Cycles=3} // P06 45 // {ProcResIdx=10, Cycles=3} // P016 46 // 47 // Note that in this case, P016 does not contribute any cycles, so it would 48 // be removed by this function. 49 // FIXME: Merge this with the equivalent in llvm-mca. 50 static SmallVector<MCWriteProcResEntry, 8> 51 getNonRedundantWriteProcRes(const MCSchedClassDesc &SCDesc, 52 const MCSubtargetInfo &STI) { 53 SmallVector<MCWriteProcResEntry, 8> Result; 54 const auto &SM = STI.getSchedModel(); 55 const unsigned NumProcRes = SM.getNumProcResourceKinds(); 56 57 // Collect resource masks. 58 SmallVector<uint64_t> ProcResourceMasks(NumProcRes); 59 mca::computeProcResourceMasks(SM, ProcResourceMasks); 60 61 // Sort entries by smaller resources for (basic) topological ordering. 62 using ResourceMaskAndEntry = std::pair<uint64_t, const MCWriteProcResEntry *>; 63 SmallVector<ResourceMaskAndEntry, 8> ResourceMaskAndEntries; 64 for (const auto *WPR = STI.getWriteProcResBegin(&SCDesc), 65 *const WPREnd = STI.getWriteProcResEnd(&SCDesc); 66 WPR != WPREnd; ++WPR) { 67 uint64_t Mask = ProcResourceMasks[WPR->ProcResourceIdx]; 68 ResourceMaskAndEntries.push_back({Mask, WPR}); 69 } 70 sort(ResourceMaskAndEntries, 71 [](const ResourceMaskAndEntry &A, const ResourceMaskAndEntry &B) { 72 unsigned popcntA = llvm::popcount(A.first); 73 unsigned popcntB = llvm::popcount(B.first); 74 if (popcntA < popcntB) 75 return true; 76 if (popcntA > popcntB) 77 return false; 78 return A.first < B.first; 79 }); 80 81 SmallVector<float, 32> ProcResUnitUsage(NumProcRes); 82 for (const ResourceMaskAndEntry &Entry : ResourceMaskAndEntries) { 83 const MCWriteProcResEntry *WPR = Entry.second; 84 const MCProcResourceDesc *const ProcResDesc = 85 SM.getProcResource(WPR->ProcResourceIdx); 86 // TODO: Handle AcquireAtAtCycle in llvm-exegesis and llvm-mca. See 87 // https://github.com/llvm/llvm-project/issues/62680 and 88 // https://github.com/llvm/llvm-project/issues/62681 89 assert(WPR->AcquireAtCycle == 0 && 90 "`llvm-exegesis` does not handle AcquireAtCycle > 0"); 91 if (ProcResDesc->SubUnitsIdxBegin == nullptr) { 92 // This is a ProcResUnit. 93 Result.push_back( 94 {WPR->ProcResourceIdx, WPR->ReleaseAtCycle, WPR->AcquireAtCycle}); 95 ProcResUnitUsage[WPR->ProcResourceIdx] += WPR->ReleaseAtCycle; 96 } else { 97 // This is a ProcResGroup. First see if it contributes any cycles or if 98 // it has cycles just from subunits. 99 float RemainingCycles = WPR->ReleaseAtCycle; 100 for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin; 101 SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits; 102 ++SubResIdx) { 103 RemainingCycles -= ProcResUnitUsage[*SubResIdx]; 104 } 105 if (RemainingCycles < 0.01f) { 106 // The ProcResGroup contributes no cycles of its own. 107 continue; 108 } 109 // The ProcResGroup contributes `RemainingCycles` cycles of its own. 110 Result.push_back({WPR->ProcResourceIdx, 111 static_cast<uint16_t>(std::round(RemainingCycles)), 112 WPR->AcquireAtCycle}); 113 // Spread the remaining cycles over all subunits. 114 for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin; 115 SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits; 116 ++SubResIdx) { 117 ProcResUnitUsage[*SubResIdx] += RemainingCycles / ProcResDesc->NumUnits; 118 } 119 } 120 } 121 return Result; 122 } 123 124 // Distributes a pressure budget as evenly as possible on the provided subunits 125 // given the already existing port pressure distribution. 126 // 127 // The algorithm is as follows: while there is remaining pressure to 128 // distribute, find the subunits with minimal pressure, and distribute 129 // remaining pressure equally up to the pressure of the unit with 130 // second-to-minimal pressure. 131 // For example, let's assume we want to distribute 2*P1256 132 // (Subunits = [P1,P2,P5,P6]), and the starting DensePressure is: 133 // DensePressure = P0 P1 P2 P3 P4 P5 P6 P7 134 // 0.1 0.3 0.2 0.0 0.0 0.5 0.5 0.5 135 // RemainingPressure = 2.0 136 // We sort the subunits by pressure: 137 // Subunits = [(P2,p=0.2), (P1,p=0.3), (P5,p=0.5), (P6, p=0.5)] 138 // We'll first start by the subunits with minimal pressure, which are at 139 // the beginning of the sorted array. In this example there is one (P2). 140 // The subunit with second-to-minimal pressure is the next one in the 141 // array (P1). So we distribute 0.1 pressure to P2, and remove 0.1 cycles 142 // from the budget. 143 // Subunits = [(P2,p=0.3), (P1,p=0.3), (P5,p=0.5), (P5,p=0.5)] 144 // RemainingPressure = 1.9 145 // We repeat this process: distribute 0.2 pressure on each of the minimal 146 // P2 and P1, decrease budget by 2*0.2: 147 // Subunits = [(P2,p=0.5), (P1,p=0.5), (P5,p=0.5), (P5,p=0.5)] 148 // RemainingPressure = 1.5 149 // There are no second-to-minimal subunits so we just share the remaining 150 // budget (1.5 cycles) equally: 151 // Subunits = [(P2,p=0.875), (P1,p=0.875), (P5,p=0.875), (P5,p=0.875)] 152 // RemainingPressure = 0.0 153 // We stop as there is no remaining budget to distribute. 154 static void distributePressure(float RemainingPressure, 155 SmallVector<uint16_t, 32> Subunits, 156 SmallVector<float, 32> &DensePressure) { 157 // Find the number of subunits with minimal pressure (they are at the 158 // front). 159 sort(Subunits, [&DensePressure](const uint16_t A, const uint16_t B) { 160 return DensePressure[A] < DensePressure[B]; 161 }); 162 const auto getPressureForSubunit = [&DensePressure, 163 &Subunits](size_t I) -> float & { 164 return DensePressure[Subunits[I]]; 165 }; 166 size_t NumMinimalSU = 1; 167 while (NumMinimalSU < Subunits.size() && 168 getPressureForSubunit(NumMinimalSU) == getPressureForSubunit(0)) { 169 ++NumMinimalSU; 170 } 171 while (RemainingPressure > 0.0f) { 172 if (NumMinimalSU == Subunits.size()) { 173 // All units are minimal, just distribute evenly and be done. 174 for (size_t I = 0; I < NumMinimalSU; ++I) { 175 getPressureForSubunit(I) += RemainingPressure / NumMinimalSU; 176 } 177 return; 178 } 179 // Distribute the remaining pressure equally. 180 const float MinimalPressure = getPressureForSubunit(NumMinimalSU - 1); 181 const float SecondToMinimalPressure = getPressureForSubunit(NumMinimalSU); 182 assert(MinimalPressure < SecondToMinimalPressure); 183 const float Increment = SecondToMinimalPressure - MinimalPressure; 184 if (RemainingPressure <= NumMinimalSU * Increment) { 185 // There is not enough remaining pressure. 186 for (size_t I = 0; I < NumMinimalSU; ++I) { 187 getPressureForSubunit(I) += RemainingPressure / NumMinimalSU; 188 } 189 return; 190 } 191 // Bump all minimal pressure subunits to `SecondToMinimalPressure`. 192 for (size_t I = 0; I < NumMinimalSU; ++I) { 193 getPressureForSubunit(I) = SecondToMinimalPressure; 194 RemainingPressure -= SecondToMinimalPressure; 195 } 196 while (NumMinimalSU < Subunits.size() && 197 getPressureForSubunit(NumMinimalSU) == SecondToMinimalPressure) { 198 ++NumMinimalSU; 199 } 200 } 201 } 202 203 std::vector<std::pair<uint16_t, float>> 204 computeIdealizedProcResPressure(const MCSchedModel &SM, 205 SmallVector<MCWriteProcResEntry, 8> WPRS) { 206 // DensePressure[I] is the port pressure for Proc Resource I. 207 SmallVector<float, 32> DensePressure(SM.getNumProcResourceKinds()); 208 sort(WPRS, [](const MCWriteProcResEntry &A, const MCWriteProcResEntry &B) { 209 return A.ProcResourceIdx < B.ProcResourceIdx; 210 }); 211 for (const MCWriteProcResEntry &WPR : WPRS) { 212 // Get units for the entry. 213 const MCProcResourceDesc *const ProcResDesc = 214 SM.getProcResource(WPR.ProcResourceIdx); 215 if (ProcResDesc->SubUnitsIdxBegin == nullptr) { 216 // This is a ProcResUnit. 217 DensePressure[WPR.ProcResourceIdx] += WPR.ReleaseAtCycle; 218 } else { 219 // This is a ProcResGroup. 220 SmallVector<uint16_t, 32> Subunits(ProcResDesc->SubUnitsIdxBegin, 221 ProcResDesc->SubUnitsIdxBegin + 222 ProcResDesc->NumUnits); 223 distributePressure(WPR.ReleaseAtCycle, Subunits, DensePressure); 224 } 225 } 226 // Turn dense pressure into sparse pressure by removing zero entries. 227 std::vector<std::pair<uint16_t, float>> Pressure; 228 for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) { 229 if (DensePressure[I] > 0.0f) 230 Pressure.emplace_back(I, DensePressure[I]); 231 } 232 return Pressure; 233 } 234 235 ResolvedSchedClass::ResolvedSchedClass(const MCSubtargetInfo &STI, 236 unsigned ResolvedSchedClassId, 237 bool WasVariant) 238 : SchedClassId(ResolvedSchedClassId), 239 SCDesc(STI.getSchedModel().getSchedClassDesc(ResolvedSchedClassId)), 240 WasVariant(WasVariant), 241 NonRedundantWriteProcRes(getNonRedundantWriteProcRes(*SCDesc, STI)), 242 IdealizedProcResPressure(computeIdealizedProcResPressure( 243 STI.getSchedModel(), NonRedundantWriteProcRes)) { 244 assert((SCDesc == nullptr || !SCDesc->isVariant()) && 245 "ResolvedSchedClass should never be variant"); 246 } 247 248 static unsigned ResolveVariantSchedClassId(const MCSubtargetInfo &STI, 249 const MCInstrInfo &InstrInfo, 250 unsigned SchedClassId, 251 const MCInst &MCI) { 252 const auto &SM = STI.getSchedModel(); 253 while (SchedClassId && SM.getSchedClassDesc(SchedClassId)->isVariant()) { 254 SchedClassId = STI.resolveVariantSchedClass(SchedClassId, &MCI, &InstrInfo, 255 SM.getProcessorID()); 256 } 257 return SchedClassId; 258 } 259 260 std::pair<unsigned /*SchedClassId*/, bool /*WasVariant*/> 261 ResolvedSchedClass::resolveSchedClassId(const MCSubtargetInfo &SubtargetInfo, 262 const MCInstrInfo &InstrInfo, 263 const MCInst &MCI) { 264 unsigned SchedClassId = InstrInfo.get(MCI.getOpcode()).getSchedClass(); 265 const bool WasVariant = SchedClassId && SubtargetInfo.getSchedModel() 266 .getSchedClassDesc(SchedClassId) 267 ->isVariant(); 268 SchedClassId = 269 ResolveVariantSchedClassId(SubtargetInfo, InstrInfo, SchedClassId, MCI); 270 return std::make_pair(SchedClassId, WasVariant); 271 } 272 273 // Returns a ProxResIdx by id or name. 274 static unsigned findProcResIdx(const MCSubtargetInfo &STI, 275 const StringRef NameOrId) { 276 // Interpret the key as an ProcResIdx. 277 unsigned ProcResIdx = 0; 278 if (to_integer(NameOrId, ProcResIdx, 10)) 279 return ProcResIdx; 280 // Interpret the key as a ProcRes name. 281 const auto &SchedModel = STI.getSchedModel(); 282 for (int I = 0, E = SchedModel.getNumProcResourceKinds(); I < E; ++I) { 283 if (NameOrId == SchedModel.getProcResource(I)->Name) 284 return I; 285 } 286 return 0; 287 } 288 289 std::vector<BenchmarkMeasure> ResolvedSchedClass::getAsPoint( 290 Benchmark::ModeE Mode, const MCSubtargetInfo &STI, 291 ArrayRef<PerInstructionStats> Representative) const { 292 const size_t NumMeasurements = Representative.size(); 293 294 std::vector<BenchmarkMeasure> SchedClassPoint(NumMeasurements); 295 296 if (Mode == Benchmark::Latency) { 297 assert(NumMeasurements == 1 && "Latency is a single measure."); 298 BenchmarkMeasure &LatencyMeasure = SchedClassPoint[0]; 299 300 // Find the latency. 301 LatencyMeasure.PerInstructionValue = 0.0; 302 303 for (unsigned I = 0; I < SCDesc->NumWriteLatencyEntries; ++I) { 304 const MCWriteLatencyEntry *const WLE = 305 STI.getWriteLatencyEntry(SCDesc, I); 306 LatencyMeasure.PerInstructionValue = 307 std::max<double>(LatencyMeasure.PerInstructionValue, WLE->Cycles); 308 } 309 } else if (Mode == Benchmark::Uops) { 310 for (auto I : zip(SchedClassPoint, Representative)) { 311 BenchmarkMeasure &Measure = std::get<0>(I); 312 const PerInstructionStats &Stats = std::get<1>(I); 313 314 StringRef Key = Stats.key(); 315 uint16_t ProcResIdx = findProcResIdx(STI, Key); 316 if (ProcResIdx > 0) { 317 // Find the pressure on ProcResIdx `Key`. 318 const auto ProcResPressureIt = 319 llvm::find_if(IdealizedProcResPressure, 320 [ProcResIdx](const std::pair<uint16_t, float> &WPR) { 321 return WPR.first == ProcResIdx; 322 }); 323 Measure.PerInstructionValue = 324 ProcResPressureIt == IdealizedProcResPressure.end() 325 ? 0.0 326 : ProcResPressureIt->second; 327 } else if (Key == "NumMicroOps") { 328 Measure.PerInstructionValue = SCDesc->NumMicroOps; 329 } else { 330 errs() << "expected `key` to be either a ProcResIdx or a ProcRes " 331 "name, got " 332 << Key << "\n"; 333 return {}; 334 } 335 } 336 } else if (Mode == Benchmark::InverseThroughput) { 337 assert(NumMeasurements == 1 && "Inverse Throughput is a single measure."); 338 BenchmarkMeasure &RThroughputMeasure = SchedClassPoint[0]; 339 340 RThroughputMeasure.PerInstructionValue = 341 MCSchedModel::getReciprocalThroughput(STI, *SCDesc); 342 } else { 343 llvm_unreachable("unimplemented measurement matching mode"); 344 } 345 346 return SchedClassPoint; 347 } 348 349 } // namespace exegesis 350 } // namespace llvm 351