//===-- Analysis.cpp --------------------------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// #include "Analysis.h" #include "BenchmarkResult.h" #include "llvm/Support/FormatVariadic.h" #include #include namespace exegesis { static const char kCsvSep = ','; namespace { enum EscapeTag { kEscapeCsv, kEscapeHtml }; template void writeEscaped(llvm::raw_ostream &OS, const llvm::StringRef S); template <> void writeEscaped(llvm::raw_ostream &OS, const llvm::StringRef S) { if (std::find(S.begin(), S.end(), kCsvSep) == S.end()) { OS << S; } else { // Needs escaping. OS << '"'; for (const char C : S) { if (C == '"') OS << "\"\""; else OS << C; } OS << '"'; } } template <> void writeEscaped(llvm::raw_ostream &OS, const llvm::StringRef S) { for (const char C : S) { if (C == '<') OS << "<"; else if (C == '>') OS << ">"; else if (C == '&') OS << "&"; else OS << C; } } } // namespace template static void writeClusterId(llvm::raw_ostream &OS, const InstructionBenchmarkClustering::ClusterId &CID) { if (CID.isNoise()) writeEscaped(OS, "[noise]"); else if (CID.isError()) writeEscaped(OS, "[error]"); else OS << CID.getId(); } template static void writeMeasurementValue(llvm::raw_ostream &OS, const double Value) { writeEscaped(OS, llvm::formatv("{0:F}", Value).str()); } // Prints a row representing an instruction, along with scheduling info and // point coordinates (measurements). void Analysis::printInstructionRowCsv(const size_t PointId, llvm::raw_ostream &OS) const { const InstructionBenchmark &Point = Clustering_.getPoints()[PointId]; writeClusterId(OS, Clustering_.getClusterIdForPoint(PointId)); OS << kCsvSep; writeEscaped(OS, Point.Key.OpcodeName); OS << kCsvSep; writeEscaped(OS, Point.Key.Config); OS << kCsvSep; const auto OpcodeIt = MnemonicToOpcode_.find(Point.Key.OpcodeName); if (OpcodeIt != MnemonicToOpcode_.end()) { const unsigned SchedClassId = InstrInfo_->get(OpcodeIt->second).getSchedClass(); #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) const auto &SchedModel = SubtargetInfo_->getSchedModel(); const llvm::MCSchedClassDesc *const SCDesc = SchedModel.getSchedClassDesc(SchedClassId); writeEscaped(OS, SCDesc->Name); #else OS << SchedClassId; #endif } // FIXME: Print the sched class once InstructionBenchmark separates key into // (mnemonic, mode, opaque). for (const auto &Measurement : Point.Measurements) { OS << kCsvSep; writeMeasurementValue(OS, Measurement.Value); } OS << "\n"; } Analysis::Analysis(const llvm::Target &Target, const InstructionBenchmarkClustering &Clustering) : Clustering_(Clustering) { if (Clustering.getPoints().empty()) return; InstrInfo_.reset(Target.createMCInstrInfo()); const InstructionBenchmark &FirstPoint = Clustering.getPoints().front(); SubtargetInfo_.reset(Target.createMCSubtargetInfo(FirstPoint.LLVMTriple, FirstPoint.CpuName, "")); // Build an index of mnemonic->opcode. for (int I = 0, E = InstrInfo_->getNumOpcodes(); I < E; ++I) MnemonicToOpcode_.emplace(InstrInfo_->getName(I), I); } template <> llvm::Error Analysis::run(llvm::raw_ostream &OS) const { if (Clustering_.getPoints().empty()) return llvm::Error::success(); // Write the header. OS << "cluster_id" << kCsvSep << "opcode_name" << kCsvSep << "config" << kCsvSep << "sched_class"; for (const auto &Measurement : Clustering_.getPoints().front().Measurements) { OS << kCsvSep; writeEscaped(OS, Measurement.Key); } OS << "\n"; // Write the points. const auto &Clusters = Clustering_.getValidClusters(); for (size_t I = 0, E = Clusters.size(); I < E; ++I) { for (const size_t PointId : Clusters[I].PointIndices) { printInstructionRowCsv(PointId, OS); } OS << "\n\n"; } return llvm::Error::success(); } std::unordered_map> Analysis::makePointsPerSchedClass() const { std::unordered_map> PointsPerSchedClass; const auto &Points = Clustering_.getPoints(); for (size_t PointId = 0, E = Points.size(); PointId < E; ++PointId) { const InstructionBenchmark &Point = Points[PointId]; if (!Point.Error.empty()) continue; const auto OpcodeIt = MnemonicToOpcode_.find(Point.Key.OpcodeName); if (OpcodeIt == MnemonicToOpcode_.end()) continue; const unsigned SchedClassId = InstrInfo_->get(OpcodeIt->second).getSchedClass(); PointsPerSchedClass[SchedClassId].push_back(PointId); } return PointsPerSchedClass; } void Analysis::printSchedClassClustersHtml(std::vector PointIds, llvm::raw_ostream &OS) const { assert(!PointIds.empty()); // Sort the points by cluster id so that we can display them grouped by // cluster. std::sort(PointIds.begin(), PointIds.end(), [this](const size_t A, const size_t B) { return Clustering_.getClusterIdForPoint(A) < Clustering_.getClusterIdForPoint(B); }); const auto &Points = Clustering_.getPoints(); OS << ""; OS << ""; for (const auto &Measurement : Points[PointIds[0]].Measurements) { OS << ""; } OS << ""; for (size_t I = 0, E = PointIds.size(); I < E;) { const auto &CurrentClusterId = Clustering_.getClusterIdForPoint(PointIds[I]); OS << ""; for (const auto &Measurement : ClusterRepresentative.Measurements) { OS << ""; } OS << ""; } OS << "

ClusterId	Opcode/Config	"; if (Measurement.DebugString.empty()) writeEscaped(OS, Measurement.Key); else writeEscaped(OS, Measurement.DebugString); OS << "
"; writeClusterId(OS, CurrentClusterId); OS << "	"; const auto &ClusterRepresentative = Points[PointIds[I]]; // FIXME: average measurements. for (; I < E && Clustering_.getClusterIdForPoint(PointIds[I]) == CurrentClusterId; ++I) { OS << " "; writeEscaped(OS, Points[PointIds[I]].Key.OpcodeName); OS << " "; writeEscaped(OS, Points[PointIds[I]].Key.Config); OS << " "; } OS << "	"; writeMeasurementValue(OS, Measurement.Value); OS << "

"; } // Return the non-redundant list of WriteProcRes used by the given sched class. // The scheduling model for LLVM is such that each instruction has a certain // number of uops which consume resources which are described by WriteProcRes // entries. Each entry describe how many cycles are spent on a specific ProcRes // kind. // For example, an instruction might have 3 uOps, one dispatching on P0 // (ProcResIdx=1) and two on P06 (ProcResIdx = 7). // Note that LLVM additionally denormalizes resource consumption to include // usage of super resources by subresources. So in practice if there exists a // P016 (ProcResIdx=10), then the cycles consumed by P0 are also consumed by // P06 (ProcResIdx = 7) and P016 (ProcResIdx = 10), and the resources consumed // by P06 are also consumed by P016. In the figure below, parenthesized cycles // denote implied usage of superresources by subresources: // P0 P06 P016 // uOp1 1 (1) (1) // uOp2 1 (1) // uOp3 1 (1) // ============================= // 1 3 3 // Eventually we end up with three entries for the WriteProcRes of the // instruction: // {ProcResIdx=1, Cycles=1} // P0 // {ProcResIdx=7, Cycles=3} // P06 // {ProcResIdx=10, Cycles=3} // P016 // // Note that in this case, P016 does not contribute any cycles, so it would // be removed by this function. // FIXME: Move this to MCSubtargetInfo and use it in llvm-mca. static llvm::SmallVector getNonRedundantWriteProcRes(const llvm::MCSchedClassDesc &SCDesc, const llvm::MCSubtargetInfo &STI) { llvm::SmallVector Result; const auto &SM = STI.getSchedModel(); const unsigned NumProcRes = SM.getNumProcResourceKinds(); // This assumes that the ProcResDescs are sorted in topological order, which // is guaranteed by the tablegen backend. llvm::SmallVector ProcResUnitUsage(NumProcRes); for (const auto *WPR = STI.getWriteProcResBegin(&SCDesc), *const WPREnd = STI.getWriteProcResEnd(&SCDesc); WPR != WPREnd; ++WPR) { const llvm::MCProcResourceDesc *const ProcResDesc = SM.getProcResource(WPR->ProcResourceIdx); if (ProcResDesc->SubUnitsIdxBegin == nullptr) { // This is a ProcResUnit. Result.push_back({WPR->ProcResourceIdx, WPR->Cycles}); ProcResUnitUsage[WPR->ProcResourceIdx] += WPR->Cycles; } else { // This is a ProcResGroup. First see if it contributes any cycles or if // it has cycles just from subunits. float RemainingCycles = WPR->Cycles; for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin; SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits; ++SubResIdx) { RemainingCycles -= ProcResUnitUsage[*SubResIdx]; } if (RemainingCycles < 0.01f) { // The ProcResGroup contributes no cycles of its own. continue; } // The ProcResGroup contributes `RemainingCycles` cycles of its own. Result.push_back({WPR->ProcResourceIdx, static_cast(std::round(RemainingCycles))}); // Spread the remaining cycles over all subunits. for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin; SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits; ++SubResIdx) { ProcResUnitUsage[*SubResIdx] += RemainingCycles / ProcResDesc->NumUnits; } } } return Result; } void Analysis::printSchedClassDescHtml(const llvm::MCSchedClassDesc &SCDesc, llvm::raw_ostream &OS) const { OS << ""; OS << ""; if (SCDesc.isValid()) { OS << ""; OS << ""; OS << ""; // Latencies. OS << ""; // WriteProcRes. OS << ""; OS << ""; } else { OS << ""; } OS << "

Valid	Variant	uOps	Latency	WriteProcRes
✔	" << (SCDesc.isVariant() ? "✔" : "✕") << "	" << SCDesc.NumMicroOps << "	"; for (int I = 0, E = SCDesc.NumWriteLatencyEntries; I < E; ++I) { const auto *const Entry = SubtargetInfo_->getWriteLatencyEntry(&SCDesc, I); OS << " " << Entry->Cycles; if (SCDesc.NumWriteLatencyEntries > 1) { // Dismabiguate if more than 1 latency. OS << " (WriteResourceID " << Entry->WriteResourceID << ")"; } OS << " "; } OS << "	"; for (const auto &WPR : getNonRedundantWriteProcRes(SCDesc, *SubtargetInfo_)) { OS << " "; writeEscaped(OS, SubtargetInfo_->getSchedModel() .getProcResource(WPR.ProcResourceIdx) ->Name); OS << ": " << WPR.Cycles << " "; } OS << "
✕

"; } static constexpr const char kHtmlHead[] = R"( llvm-exegesis Analysis Results )"; template <> llvm::Error Analysis::run( llvm::raw_ostream &OS) const { // Print the header. OS << "" << kHtmlHead << ""; OS << "

llvm-exegesis Analysis Results

"; OS << "

Triple: "; writeEscaped(OS, Clustering_.getPoints()[0].LLVMTriple); OS << "

Cpu: "; writeEscaped(OS, Clustering_.getPoints()[0].CpuName); OS << "

"; // All the points in a scheduling class should be in the same cluster. // Print any scheduling class for which this is not the case. for (const auto &SchedClassAndPoints : makePointsPerSchedClass()) { std::unordered_set ClustersForSchedClass; for (const size_t PointId : SchedClassAndPoints.second) { const auto &ClusterId = Clustering_.getClusterIdForPoint(PointId); if (!ClusterId.isValid()) continue; // Ignore noise and errors. ClustersForSchedClass.insert(ClusterId.getId()); } if (ClustersForSchedClass.size() <= 1) continue; // Nothing weird. const auto &SchedModel = SubtargetInfo_->getSchedModel(); const llvm::MCSchedClassDesc *const SCDesc = SchedModel.getSchedClassDesc(SchedClassAndPoints.first); if (!SCDesc) continue; OS << "

Sched Class "; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) writeEscaped(OS, SCDesc->Name); #else OS << SchedClassAndPoints.first; #endif OS << " contains instructions with distinct performance " "characteristics, falling into " << ClustersForSchedClass.size() << " clusters:

"; printSchedClassClustersHtml(SchedClassAndPoints.second, OS); OS << "

llvm data:

"; printSchedClassDescHtml(*SCDesc, OS); OS << "

"; } OS << ""; return llvm::Error::success(); } } // namespace exegesis