1 //===-- Clustering.cpp ------------------------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "Clustering.h" 10 #include "Error.h" 11 #include "SchedClassResolution.h" 12 #include "llvm/ADT/MapVector.h" 13 #include "llvm/ADT/SetVector.h" 14 #include "llvm/ADT/SmallSet.h" 15 #include "llvm/ADT/SmallVector.h" 16 #include <algorithm> 17 #include <deque> 18 #include <string> 19 #include <vector> 20 21 namespace llvm { 22 namespace exegesis { 23 24 // The clustering problem has the following characteristics: 25 // (A) - Low dimension (dimensions are typically proc resource units, 26 // typically < 10). 27 // (B) - Number of points : ~thousands (points are measurements of an MCInst) 28 // (C) - Number of clusters: ~tens. 29 // (D) - The number of clusters is not known /a priory/. 30 // (E) - The amount of noise is relatively small. 31 // The problem is rather small. In terms of algorithms, (D) disqualifies 32 // k-means and makes algorithms such as DBSCAN[1] or OPTICS[2] more applicable. 33 // 34 // We've used DBSCAN here because it's simple to implement. This is a pretty 35 // straightforward and inefficient implementation of the pseudocode in [2]. 36 // 37 // [1] https://en.wikipedia.org/wiki/DBSCAN 38 // [2] https://en.wikipedia.org/wiki/OPTICS_algorithm 39 40 // Finds the points at distance less than sqrt(EpsilonSquared) of Q (not 41 // including Q). 42 void InstructionBenchmarkClustering::rangeQuery( 43 const size_t Q, std::vector<size_t> &Neighbors) const { 44 Neighbors.clear(); 45 Neighbors.reserve(Points_.size() - 1); // The Q itself isn't a neighbor. 46 const auto &QMeasurements = Points_[Q].Measurements; 47 for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { 48 if (P == Q) 49 continue; 50 const auto &PMeasurements = Points_[P].Measurements; 51 if (PMeasurements.empty()) // Error point. 52 continue; 53 if (isNeighbour(PMeasurements, QMeasurements, 54 AnalysisClusteringEpsilonSquared_)) { 55 Neighbors.push_back(P); 56 } 57 } 58 } 59 60 // Given a set of points, checks that all the points are neighbours 61 // up to AnalysisClusteringEpsilon. This is O(2*N). 62 bool InstructionBenchmarkClustering::areAllNeighbours( 63 ArrayRef<size_t> Pts) const { 64 // First, get the centroid of this group of points. This is O(N). 65 SchedClassClusterCentroid G; 66 for (size_t P : Pts) { 67 assert(P < Points_.size()); 68 ArrayRef<BenchmarkMeasure> Measurements = Points_[P].Measurements; 69 if (Measurements.empty()) // Error point. 70 continue; 71 G.addPoint(Measurements); 72 } 73 const std::vector<BenchmarkMeasure> Centroid = G.getAsPoint(); 74 75 // Since we will be comparing with the centroid, we need to halve the epsilon. 76 double AnalysisClusteringEpsilonHalvedSquared = 77 AnalysisClusteringEpsilonSquared_ / 4.0; 78 79 // And now check that every point is a neighbour of the centroid. Also O(N). 80 return all_of( 81 Pts, [this, &Centroid, AnalysisClusteringEpsilonHalvedSquared](size_t P) { 82 assert(P < Points_.size()); 83 const auto &PMeasurements = Points_[P].Measurements; 84 if (PMeasurements.empty()) // Error point. 85 return true; // Pretend that error point is a neighbour. 86 return isNeighbour(PMeasurements, Centroid, 87 AnalysisClusteringEpsilonHalvedSquared); 88 }); 89 } 90 91 InstructionBenchmarkClustering::InstructionBenchmarkClustering( 92 const std::vector<InstructionBenchmark> &Points, 93 const double AnalysisClusteringEpsilonSquared) 94 : Points_(Points), 95 AnalysisClusteringEpsilonSquared_(AnalysisClusteringEpsilonSquared), 96 NoiseCluster_(ClusterId::noise()), ErrorCluster_(ClusterId::error()) {} 97 98 Error InstructionBenchmarkClustering::validateAndSetup() { 99 ClusterIdForPoint_.resize(Points_.size()); 100 // Mark erroneous measurements out. 101 // All points must have the same number of dimensions, in the same order. 102 const std::vector<BenchmarkMeasure> *LastMeasurement = nullptr; 103 for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { 104 const auto &Point = Points_[P]; 105 if (!Point.Error.empty()) { 106 ClusterIdForPoint_[P] = ClusterId::error(); 107 ErrorCluster_.PointIndices.push_back(P); 108 continue; 109 } 110 const auto *CurMeasurement = &Point.Measurements; 111 if (LastMeasurement) { 112 if (LastMeasurement->size() != CurMeasurement->size()) { 113 return make_error<ClusteringError>( 114 "inconsistent measurement dimensions"); 115 } 116 for (size_t I = 0, E = LastMeasurement->size(); I < E; ++I) { 117 if (LastMeasurement->at(I).Key != CurMeasurement->at(I).Key) { 118 return make_error<ClusteringError>( 119 "inconsistent measurement dimensions keys"); 120 } 121 } 122 } 123 LastMeasurement = CurMeasurement; 124 } 125 if (LastMeasurement) { 126 NumDimensions_ = LastMeasurement->size(); 127 } 128 return Error::success(); 129 } 130 131 void InstructionBenchmarkClustering::clusterizeDbScan(const size_t MinPts) { 132 std::vector<size_t> Neighbors; // Persistent buffer to avoid allocs. 133 for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { 134 if (!ClusterIdForPoint_[P].isUndef()) 135 continue; // Previously processed in inner loop. 136 rangeQuery(P, Neighbors); 137 if (Neighbors.size() + 1 < MinPts) { // Density check. 138 // The region around P is not dense enough to create a new cluster, mark 139 // as noise for now. 140 ClusterIdForPoint_[P] = ClusterId::noise(); 141 continue; 142 } 143 144 // Create a new cluster, add P. 145 Clusters_.emplace_back(ClusterId::makeValid(Clusters_.size())); 146 Cluster &CurrentCluster = Clusters_.back(); 147 ClusterIdForPoint_[P] = CurrentCluster.Id; /* Label initial point */ 148 CurrentCluster.PointIndices.push_back(P); 149 150 // Process P's neighbors. 151 SetVector<size_t, std::deque<size_t>> ToProcess; 152 ToProcess.insert(Neighbors.begin(), Neighbors.end()); 153 while (!ToProcess.empty()) { 154 // Retrieve a point from the set. 155 const size_t Q = *ToProcess.begin(); 156 ToProcess.erase(ToProcess.begin()); 157 158 if (ClusterIdForPoint_[Q].isNoise()) { 159 // Change noise point to border point. 160 ClusterIdForPoint_[Q] = CurrentCluster.Id; 161 CurrentCluster.PointIndices.push_back(Q); 162 continue; 163 } 164 if (!ClusterIdForPoint_[Q].isUndef()) { 165 continue; // Previously processed. 166 } 167 // Add Q to the current custer. 168 ClusterIdForPoint_[Q] = CurrentCluster.Id; 169 CurrentCluster.PointIndices.push_back(Q); 170 // And extend to the neighbors of Q if the region is dense enough. 171 rangeQuery(Q, Neighbors); 172 if (Neighbors.size() + 1 >= MinPts) { 173 ToProcess.insert(Neighbors.begin(), Neighbors.end()); 174 } 175 } 176 } 177 // assert(Neighbors.capacity() == (Points_.size() - 1)); 178 // ^ True, but it is not quaranteed to be true in all the cases. 179 180 // Add noisy points to noise cluster. 181 for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { 182 if (ClusterIdForPoint_[P].isNoise()) { 183 NoiseCluster_.PointIndices.push_back(P); 184 } 185 } 186 } 187 188 void InstructionBenchmarkClustering::clusterizeNaive( 189 const MCSubtargetInfo &SubtargetInfo, const MCInstrInfo &InstrInfo) { 190 // Given an instruction Opcode, which sched class id's are represented, 191 // and which are the benchmarks for each sched class? 192 std::vector<SmallMapVector<unsigned, SmallVector<size_t, 1>, 1>> 193 OpcodeToSchedClassesToPoints; 194 const unsigned NumOpcodes = InstrInfo.getNumOpcodes(); 195 OpcodeToSchedClassesToPoints.resize(NumOpcodes); 196 size_t NumClusters = 0; 197 for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { 198 const InstructionBenchmark &Point = Points_[P]; 199 const MCInst &MCI = Point.keyInstruction(); 200 unsigned SchedClassId; 201 std::tie(SchedClassId, std::ignore) = 202 ResolvedSchedClass::resolveSchedClassId(SubtargetInfo, InstrInfo, MCI); 203 const unsigned Opcode = MCI.getOpcode(); 204 assert(Opcode < NumOpcodes && "NumOpcodes is incorrect (too small)"); 205 auto &Points = OpcodeToSchedClassesToPoints[Opcode][SchedClassId]; 206 if (Points.empty()) // If we previously have not seen any points of 207 ++NumClusters; // this opcode's sched class, then new cluster begins. 208 Points.emplace_back(P); 209 } 210 assert(NumClusters <= NumOpcodes && 211 "can't see more opcodes than there are total opcodes"); 212 assert(NumClusters <= Points_.size() && 213 "can't see more opcodes than there are total points"); 214 215 Clusters_.reserve(NumClusters); // We already know how many clusters there is. 216 for (const auto &SchedClassesOfOpcode : OpcodeToSchedClassesToPoints) { 217 if (SchedClassesOfOpcode.empty()) 218 continue; 219 for (ArrayRef<size_t> PointsOfSchedClass : 220 make_second_range(SchedClassesOfOpcode)) { 221 if (PointsOfSchedClass.empty()) 222 continue; 223 // Create a new cluster. 224 Clusters_.emplace_back(ClusterId::makeValid( 225 Clusters_.size(), 226 /*IsUnstable=*/!areAllNeighbours(PointsOfSchedClass))); 227 Cluster &CurrentCluster = Clusters_.back(); 228 // Mark points as belonging to the new cluster. 229 for (size_t P : PointsOfSchedClass) 230 ClusterIdForPoint_[P] = CurrentCluster.Id; 231 // And add all the points of this opcode's sched class to the new cluster. 232 CurrentCluster.PointIndices.reserve(PointsOfSchedClass.size()); 233 CurrentCluster.PointIndices.assign(PointsOfSchedClass.begin(), 234 PointsOfSchedClass.end()); 235 assert(CurrentCluster.PointIndices.size() == PointsOfSchedClass.size()); 236 } 237 } 238 assert(Clusters_.size() == NumClusters); 239 } 240 241 // Given an instruction Opcode, we can make benchmarks (measurements) of the 242 // instruction characteristics/performance. Then, to facilitate further analysis 243 // we group the benchmarks with *similar* characteristics into clusters. 244 // Now, this is all not entirely deterministic. Some instructions have variable 245 // characteristics, depending on their arguments. And thus, if we do several 246 // benchmarks of the same instruction Opcode, we may end up with *different* 247 // performance characteristics measurements. And when we then do clustering, 248 // these several benchmarks of the same instruction Opcode may end up being 249 // clustered into *different* clusters. This is not great for further analysis. 250 // We shall find every opcode with benchmarks not in just one cluster, and move 251 // *all* the benchmarks of said Opcode into one new unstable cluster per Opcode. 252 void InstructionBenchmarkClustering::stabilize(unsigned NumOpcodes) { 253 // Given an instruction Opcode and Config, in which clusters do benchmarks of 254 // this instruction lie? Normally, they all should be in the same cluster. 255 struct OpcodeAndConfig { 256 explicit OpcodeAndConfig(const InstructionBenchmark &IB) 257 : Opcode(IB.keyInstruction().getOpcode()), Config(&IB.Key.Config) {} 258 unsigned Opcode; 259 const std::string *Config; 260 261 auto Tie() const -> auto { return std::tie(Opcode, *Config); } 262 263 bool operator<(const OpcodeAndConfig &O) const { return Tie() < O.Tie(); } 264 bool operator!=(const OpcodeAndConfig &O) const { return Tie() != O.Tie(); } 265 }; 266 std::map<OpcodeAndConfig, SmallSet<ClusterId, 1>> OpcodeConfigToClusterIDs; 267 // Populate OpcodeConfigToClusterIDs and UnstableOpcodes data structures. 268 assert(ClusterIdForPoint_.size() == Points_.size() && "size mismatch"); 269 for (auto Point : zip(Points_, ClusterIdForPoint_)) { 270 const ClusterId &ClusterIdOfPoint = std::get<1>(Point); 271 if (!ClusterIdOfPoint.isValid()) 272 continue; // Only process fully valid clusters. 273 const OpcodeAndConfig Key(std::get<0>(Point)); 274 SmallSet<ClusterId, 1> &ClusterIDsOfOpcode = OpcodeConfigToClusterIDs[Key]; 275 ClusterIDsOfOpcode.insert(ClusterIdOfPoint); 276 } 277 278 for (const auto &OpcodeConfigToClusterID : OpcodeConfigToClusterIDs) { 279 const SmallSet<ClusterId, 1> &ClusterIDs = OpcodeConfigToClusterID.second; 280 const OpcodeAndConfig &Key = OpcodeConfigToClusterID.first; 281 // We only care about unstable instructions. 282 if (ClusterIDs.size() < 2) 283 continue; 284 285 // Create a new unstable cluster, one per Opcode. 286 Clusters_.emplace_back(ClusterId::makeValidUnstable(Clusters_.size())); 287 Cluster &UnstableCluster = Clusters_.back(); 288 // We will find *at least* one point in each of these clusters. 289 UnstableCluster.PointIndices.reserve(ClusterIDs.size()); 290 291 // Go through every cluster which we recorded as containing benchmarks 292 // of this UnstableOpcode. NOTE: we only recorded valid clusters. 293 for (const ClusterId &CID : ClusterIDs) { 294 assert(CID.isValid() && 295 "We only recorded valid clusters, not noise/error clusters."); 296 Cluster &OldCluster = Clusters_[CID.getId()]; // Valid clusters storage. 297 // Within each cluster, go through each point, and either move it to the 298 // new unstable cluster, or 'keep' it. 299 // In this case, we'll reshuffle OldCluster.PointIndices vector 300 // so that all the points that are *not* for UnstableOpcode are first, 301 // and the rest of the points is for the UnstableOpcode. 302 const auto it = std::stable_partition( 303 OldCluster.PointIndices.begin(), OldCluster.PointIndices.end(), 304 [this, &Key](size_t P) { 305 return OpcodeAndConfig(Points_[P]) != Key; 306 }); 307 assert(std::distance(it, OldCluster.PointIndices.end()) > 0 && 308 "Should have found at least one bad point"); 309 // Mark to-be-moved points as belonging to the new cluster. 310 std::for_each(it, OldCluster.PointIndices.end(), 311 [this, &UnstableCluster](size_t P) { 312 ClusterIdForPoint_[P] = UnstableCluster.Id; 313 }); 314 // Actually append to-be-moved points to the new cluster. 315 UnstableCluster.PointIndices.insert(UnstableCluster.PointIndices.end(), 316 it, OldCluster.PointIndices.end()); 317 // And finally, remove "to-be-moved" points from the old cluster. 318 OldCluster.PointIndices.erase(it, OldCluster.PointIndices.end()); 319 // Now, the old cluster may end up being empty, but let's just keep it 320 // in whatever state it ended up. Purging empty clusters isn't worth it. 321 }; 322 assert(UnstableCluster.PointIndices.size() > 1 && 323 "New unstable cluster should end up with more than one point."); 324 assert(UnstableCluster.PointIndices.size() >= ClusterIDs.size() && 325 "New unstable cluster should end up with no less points than there " 326 "was clusters"); 327 } 328 } 329 330 Expected<InstructionBenchmarkClustering> InstructionBenchmarkClustering::create( 331 const std::vector<InstructionBenchmark> &Points, const ModeE Mode, 332 const size_t DbscanMinPts, const double AnalysisClusteringEpsilon, 333 const MCSubtargetInfo *SubtargetInfo, const MCInstrInfo *InstrInfo) { 334 InstructionBenchmarkClustering Clustering( 335 Points, AnalysisClusteringEpsilon * AnalysisClusteringEpsilon); 336 if (auto Error = Clustering.validateAndSetup()) { 337 return std::move(Error); 338 } 339 if (Clustering.ErrorCluster_.PointIndices.size() == Points.size()) { 340 return Clustering; // Nothing to cluster. 341 } 342 343 if (Mode == ModeE::Dbscan) { 344 Clustering.clusterizeDbScan(DbscanMinPts); 345 346 if (InstrInfo) 347 Clustering.stabilize(InstrInfo->getNumOpcodes()); 348 } else /*if(Mode == ModeE::Naive)*/ { 349 if (!SubtargetInfo || !InstrInfo) 350 return make_error<Failure>("'naive' clustering mode requires " 351 "SubtargetInfo and InstrInfo to be present"); 352 Clustering.clusterizeNaive(*SubtargetInfo, *InstrInfo); 353 } 354 355 return Clustering; 356 } 357 358 void SchedClassClusterCentroid::addPoint(ArrayRef<BenchmarkMeasure> Point) { 359 if (Representative.empty()) 360 Representative.resize(Point.size()); 361 assert(Representative.size() == Point.size() && 362 "All points should have identical dimensions."); 363 364 for (auto I : zip(Representative, Point)) 365 std::get<0>(I).push(std::get<1>(I)); 366 } 367 368 std::vector<BenchmarkMeasure> SchedClassClusterCentroid::getAsPoint() const { 369 std::vector<BenchmarkMeasure> ClusterCenterPoint(Representative.size()); 370 for (auto I : zip(ClusterCenterPoint, Representative)) 371 std::get<0>(I).PerInstructionValue = std::get<1>(I).avg(); 372 return ClusterCenterPoint; 373 } 374 375 bool SchedClassClusterCentroid::validate( 376 InstructionBenchmark::ModeE Mode) const { 377 size_t NumMeasurements = Representative.size(); 378 switch (Mode) { 379 case InstructionBenchmark::Latency: 380 if (NumMeasurements != 1) { 381 errs() 382 << "invalid number of measurements in latency mode: expected 1, got " 383 << NumMeasurements << "\n"; 384 return false; 385 } 386 break; 387 case InstructionBenchmark::Uops: 388 // Can have many measurements. 389 break; 390 case InstructionBenchmark::InverseThroughput: 391 if (NumMeasurements != 1) { 392 errs() << "invalid number of measurements in inverse throughput " 393 "mode: expected 1, got " 394 << NumMeasurements << "\n"; 395 return false; 396 } 397 break; 398 default: 399 llvm_unreachable("unimplemented measurement matching mode"); 400 return false; 401 } 402 403 return true; // All good. 404 } 405 406 } // namespace exegesis 407 } // namespace llvm 408