//===-- SchedClassResolution.cpp --------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SchedClassResolution.h"
#include "BenchmarkResult.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/bit.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MCA/Support.h"
#include "llvm/Support/FormatVariadic.h"
#include <cmath>
#include <limits>
#include <unordered_set>
#include <vector>

namespace llvm {
namespace exegesis {

// Return the non-redundant list of WriteProcRes used by the given sched class.
// The scheduling model for LLVM is such that each instruction has a certain
// number of uops which consume resources which are described by WriteProcRes
// entries. Each entry describes how many cycles are spent on a specific
// ProcRes kind.
// For example, an instruction might have 3 uOps, one dispatching on P0
// (ProcResIdx=1) and two on P06 (ProcResIdx=7).
// Note that LLVM additionally denormalizes resource consumption to include
// usage of super resources by subresources. So in practice if there exists a
// P016 (ProcResIdx=10), then the cycles consumed by P0 are also consumed by
// P06 (ProcResIdx=7) and P016 (ProcResIdx=10), and the resources consumed
// by P06 are also consumed by P016. In the table below, parenthesized cycles
// denote implied usage of superresources by subresources:
//            P0   P06   P016
//   uOp1      1   (1)    (1)
//   uOp2            1    (1)
//   uOp3            1    (1)
//   =========================
//             1    3      3
// Eventually we end up with three entries for the WriteProcRes of the
// instruction:
//   {ProcResIdx=1,  Cycles=1}  // P0
//   {ProcResIdx=7,  Cycles=3}  // P06
//   {ProcResIdx=10, Cycles=3}  // P016
//
// Note that in this case, P016 does not contribute any cycles of its own, so
// it would be removed by this function.
// FIXME: Merge this with the equivalent in llvm-mca.
static SmallVector<MCWriteProcResEntry, 8>
getNonRedundantWriteProcRes(const MCSchedClassDesc &SCDesc,
                            const MCSubtargetInfo &STI) {
  SmallVector<MCWriteProcResEntry, 8> Result;
  const auto &SM = STI.getSchedModel();
  const unsigned NumProcRes = SM.getNumProcResourceKinds();

  // Collect resource masks.
  SmallVector<uint64_t> ProcResourceMasks(NumProcRes);
  mca::computeProcResourceMasks(SM, ProcResourceMasks);

  // Sort entries by smaller resources for (basic) topological ordering.
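  // Sorting by the population count of the resource mask processes individual
  // ProcResUnits (a single bit set) before ProcResGroups, and smaller groups
  // before the larger groups that contain them, so the group handling below
  // only keeps the cycles that its subunits do not already account for.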
  using ResourceMaskAndEntry = std::pair<uint64_t, const MCWriteProcResEntry *>;
  SmallVector<ResourceMaskAndEntry, 8> ResourceMaskAndEntries;
  for (const auto *WPR = STI.getWriteProcResBegin(&SCDesc),
                  *const WPREnd = STI.getWriteProcResEnd(&SCDesc);
       WPR != WPREnd; ++WPR) {
    uint64_t Mask = ProcResourceMasks[WPR->ProcResourceIdx];
    ResourceMaskAndEntries.push_back({Mask, WPR});
  }
  sort(ResourceMaskAndEntries,
       [](const ResourceMaskAndEntry &A, const ResourceMaskAndEntry &B) {
         unsigned popcntA = llvm::popcount(A.first);
         unsigned popcntB = llvm::popcount(B.first);
         if (popcntA < popcntB)
           return true;
         if (popcntA > popcntB)
           return false;
         return A.first < B.first;
       });

  SmallVector<float, 32> ProcResUnitUsage(NumProcRes);
  for (const ResourceMaskAndEntry &Entry : ResourceMaskAndEntries) {
    const MCWriteProcResEntry *WPR = Entry.second;
    const MCProcResourceDesc *const ProcResDesc =
        SM.getProcResource(WPR->ProcResourceIdx);
    if (ProcResDesc->SubUnitsIdxBegin == nullptr) {
      // This is a ProcResUnit.
      Result.push_back({WPR->ProcResourceIdx, WPR->Cycles});
      ProcResUnitUsage[WPR->ProcResourceIdx] += WPR->Cycles;
    } else {
      // This is a ProcResGroup. First see if it contributes any cycles or if
      // it has cycles just from subunits.
      float RemainingCycles = WPR->Cycles;
      for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin;
           SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits;
           ++SubResIdx) {
        RemainingCycles -= ProcResUnitUsage[*SubResIdx];
      }
      if (RemainingCycles < 0.01f) {
        // The ProcResGroup contributes no cycles of its own.
        continue;
      }
      // The ProcResGroup contributes `RemainingCycles` cycles of its own.
      Result.push_back({WPR->ProcResourceIdx,
                        static_cast<uint16_t>(std::round(RemainingCycles))});
      // Spread the remaining cycles over all subunits.
      for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin;
           SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits;
           ++SubResIdx) {
        ProcResUnitUsage[*SubResIdx] += RemainingCycles / ProcResDesc->NumUnits;
      }
    }
  }
  return Result;
}

// Distributes a pressure budget as evenly as possible over the provided
// subunits, given the already existing port pressure distribution.
//
// The algorithm is as follows: while there is remaining pressure to
// distribute, find the subunits with minimal pressure, and distribute
// remaining pressure equally up to the pressure of the unit with
// second-to-minimal pressure.
// For example, let's assume we want to distribute 2*P1256
// (Subunits = [P1,P2,P5,P6]), and the starting DensePressure is:
//     DensePressure =   P0   P1   P2   P3   P4   P5   P6   P7
//                      0.1  0.3  0.2  0.0  0.0  0.5  0.5  0.5
//     RemainingPressure = 2.0
// We sort the subunits by pressure:
//     Subunits = [(P2,p=0.2), (P1,p=0.3), (P5,p=0.5), (P6,p=0.5)]
// We first start with the subunits with minimal pressure, which are at
// the beginning of the sorted array. In this example there is one (P2).
// The subunit with second-to-minimal pressure is the next one in the
// array (P1). So we distribute 0.1 pressure to P2, and remove 0.1 cycles
// from the budget.
//     Subunits = [(P2,p=0.3), (P1,p=0.3), (P5,p=0.5), (P6,p=0.5)]
//     RemainingPressure = 1.9
// We repeat this process: distribute 0.2 pressure on each of the minimal
// P2 and P1, decrease budget by 2*0.2:
//     Subunits = [(P2,p=0.5), (P1,p=0.5), (P5,p=0.5), (P6,p=0.5)]
//     RemainingPressure = 1.5
// There are no second-to-minimal subunits left, so we just share the
// remaining budget (1.5 cycles) equally:
//     Subunits = [(P2,p=0.875), (P1,p=0.875), (P5,p=0.875), (P6,p=0.875)]
//     RemainingPressure = 0.0
// We stop as there is no remaining budget to distribute.
static void distributePressure(float RemainingPressure,
                               SmallVector<uint16_t, 32> Subunits,
                               SmallVector<float, 32> &DensePressure) {
  // Find the number of subunits with minimal pressure (they are at the
  // front).
  sort(Subunits, [&DensePressure](const uint16_t A, const uint16_t B) {
    return DensePressure[A] < DensePressure[B];
  });
  const auto getPressureForSubunit = [&DensePressure,
                                      &Subunits](size_t I) -> float & {
    return DensePressure[Subunits[I]];
  };
  size_t NumMinimalSU = 1;
  while (NumMinimalSU < Subunits.size() &&
         getPressureForSubunit(NumMinimalSU) == getPressureForSubunit(0)) {
    ++NumMinimalSU;
  }
  while (RemainingPressure > 0.0f) {
    if (NumMinimalSU == Subunits.size()) {
      // All units are minimal, just distribute evenly and be done.
      for (size_t I = 0; I < NumMinimalSU; ++I) {
        getPressureForSubunit(I) += RemainingPressure / NumMinimalSU;
      }
      return;
    }
    // Distribute the remaining pressure equally.
    const float MinimalPressure = getPressureForSubunit(NumMinimalSU - 1);
    const float SecondToMinimalPressure = getPressureForSubunit(NumMinimalSU);
    assert(MinimalPressure < SecondToMinimalPressure);
    const float Increment = SecondToMinimalPressure - MinimalPressure;
    if (RemainingPressure <= NumMinimalSU * Increment) {
      // There is not enough remaining pressure.
      for (size_t I = 0; I < NumMinimalSU; ++I) {
        getPressureForSubunit(I) += RemainingPressure / NumMinimalSU;
      }
      return;
    }
    // Bump all minimal pressure subunits to `SecondToMinimalPressure`.
    for (size_t I = 0; I < NumMinimalSU; ++I) {
      getPressureForSubunit(I) = SecondToMinimalPressure;
      RemainingPressure -= SecondToMinimalPressure;
    }
    while (NumMinimalSU < Subunits.size() &&
           getPressureForSubunit(NumMinimalSU) == SecondToMinimalPressure) {
      ++NumMinimalSU;
    }
  }
}

std::vector<std::pair<uint16_t, float>>
computeIdealizedProcResPressure(const MCSchedModel &SM,
                                SmallVector<MCWriteProcResEntry, 8> WPRS) {
  // DensePressure[I] is the port pressure for Proc Resource I.
  SmallVector<float, 32> DensePressure(SM.getNumProcResourceKinds());
  sort(WPRS, [](const MCWriteProcResEntry &A, const MCWriteProcResEntry &B) {
    return A.ProcResourceIdx < B.ProcResourceIdx;
  });
  for (const MCWriteProcResEntry &WPR : WPRS) {
    // Get units for the entry.
    const MCProcResourceDesc *const ProcResDesc =
        SM.getProcResource(WPR.ProcResourceIdx);
    if (ProcResDesc->SubUnitsIdxBegin == nullptr) {
      // This is a ProcResUnit.
      DensePressure[WPR.ProcResourceIdx] += WPR.Cycles;
    } else {
      // This is a ProcResGroup.
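      // Spread the group's cycles over its individual subunits, taking the
      // pressure that previous entries already assigned to them into account.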
      SmallVector<uint16_t, 32> Subunits(ProcResDesc->SubUnitsIdxBegin,
                                         ProcResDesc->SubUnitsIdxBegin +
                                             ProcResDesc->NumUnits);
      distributePressure(WPR.Cycles, Subunits, DensePressure);
    }
  }
  // Turn dense pressure into sparse pressure by removing zero entries.
  std::vector<std::pair<uint16_t, float>> Pressure;
  for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
    if (DensePressure[I] > 0.0f)
      Pressure.emplace_back(I, DensePressure[I]);
  }
  return Pressure;
}

ResolvedSchedClass::ResolvedSchedClass(const MCSubtargetInfo &STI,
                                       unsigned ResolvedSchedClassId,
                                       bool WasVariant)
    : SchedClassId(ResolvedSchedClassId),
      SCDesc(STI.getSchedModel().getSchedClassDesc(ResolvedSchedClassId)),
      WasVariant(WasVariant),
      NonRedundantWriteProcRes(getNonRedundantWriteProcRes(*SCDesc, STI)),
      IdealizedProcResPressure(computeIdealizedProcResPressure(
          STI.getSchedModel(), NonRedundantWriteProcRes)) {
  assert((SCDesc == nullptr || !SCDesc->isVariant()) &&
         "ResolvedSchedClass should never be variant");
}

static unsigned ResolveVariantSchedClassId(const MCSubtargetInfo &STI,
                                           const MCInstrInfo &InstrInfo,
                                           unsigned SchedClassId,
                                           const MCInst &MCI) {
  const auto &SM = STI.getSchedModel();
  while (SchedClassId && SM.getSchedClassDesc(SchedClassId)->isVariant()) {
    SchedClassId = STI.resolveVariantSchedClass(SchedClassId, &MCI, &InstrInfo,
                                                SM.getProcessorID());
  }
  return SchedClassId;
}

std::pair<unsigned /*SchedClassId*/, bool /*WasVariant*/>
ResolvedSchedClass::resolveSchedClassId(const MCSubtargetInfo &SubtargetInfo,
                                        const MCInstrInfo &InstrInfo,
                                        const MCInst &MCI) {
  unsigned SchedClassId = InstrInfo.get(MCI.getOpcode()).getSchedClass();
  const bool WasVariant = SchedClassId && SubtargetInfo.getSchedModel()
                                              .getSchedClassDesc(SchedClassId)
                                              ->isVariant();
  SchedClassId =
      ResolveVariantSchedClassId(SubtargetInfo, InstrInfo, SchedClassId, MCI);
  return std::make_pair(SchedClassId, WasVariant);
}

// Returns a ProcResIdx by id or name.
static unsigned findProcResIdx(const MCSubtargetInfo &STI,
                               const StringRef NameOrId) {
  // Interpret the key as a ProcResIdx.
  unsigned ProcResIdx = 0;
  if (to_integer(NameOrId, ProcResIdx, 10))
    return ProcResIdx;
  // Interpret the key as a ProcRes name.
  const auto &SchedModel = STI.getSchedModel();
  for (int I = 0, E = SchedModel.getNumProcResourceKinds(); I < E; ++I) {
    if (NameOrId == SchedModel.getProcResource(I)->Name)
      return I;
  }
  return 0;
}

std::vector<BenchmarkMeasure> ResolvedSchedClass::getAsPoint(
    InstructionBenchmark::ModeE Mode, const MCSubtargetInfo &STI,
    ArrayRef<PerInstructionStats> Representative) const {
  const size_t NumMeasurements = Representative.size();

  std::vector<BenchmarkMeasure> SchedClassPoint(NumMeasurements);

  if (Mode == InstructionBenchmark::Latency) {
    assert(NumMeasurements == 1 && "Latency is a single measure.");
    BenchmarkMeasure &LatencyMeasure = SchedClassPoint[0];

    // Find the latency.
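    // The latency of the sched class is the largest latency over all of its
    // write latency entries.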
    LatencyMeasure.PerInstructionValue = 0.0;

    for (unsigned I = 0; I < SCDesc->NumWriteLatencyEntries; ++I) {
      const MCWriteLatencyEntry *const WLE =
          STI.getWriteLatencyEntry(SCDesc, I);
      LatencyMeasure.PerInstructionValue =
          std::max<double>(LatencyMeasure.PerInstructionValue, WLE->Cycles);
    }
  } else if (Mode == InstructionBenchmark::Uops) {
    for (auto I : zip(SchedClassPoint, Representative)) {
      BenchmarkMeasure &Measure = std::get<0>(I);
      const PerInstructionStats &Stats = std::get<1>(I);

      StringRef Key = Stats.key();
      uint16_t ProcResIdx = findProcResIdx(STI, Key);
      if (ProcResIdx > 0) {
        // Find the pressure on ProcResIdx `Key`.
        const auto ProcResPressureIt =
            llvm::find_if(IdealizedProcResPressure,
                          [ProcResIdx](const std::pair<uint16_t, float> &WPR) {
                            return WPR.first == ProcResIdx;
                          });
        Measure.PerInstructionValue =
            ProcResPressureIt == IdealizedProcResPressure.end()
                ? 0.0
                : ProcResPressureIt->second;
      } else if (Key == "NumMicroOps") {
        Measure.PerInstructionValue = SCDesc->NumMicroOps;
      } else {
        errs() << "expected `key` to be either a ProcResIdx or a ProcRes "
                  "name, got "
               << Key << "\n";
        return {};
      }
    }
  } else if (Mode == InstructionBenchmark::InverseThroughput) {
    assert(NumMeasurements == 1 && "Inverse Throughput is a single measure.");
    BenchmarkMeasure &RThroughputMeasure = SchedClassPoint[0];

    RThroughputMeasure.PerInstructionValue =
        MCSchedModel::getReciprocalThroughput(STI, *SCDesc);
  } else {
    llvm_unreachable("unimplemented measurement matching mode");
  }

  return SchedClassPoint;
}

} // namespace exegesis
} // namespace llvm