1 //===-- Clustering.cpp ------------------------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "Clustering.h" 10 #include "llvm/ADT/SetVector.h" 11 #include "llvm/ADT/SmallSet.h" 12 #include "llvm/ADT/SmallVector.h" 13 #include <algorithm> 14 #include <string> 15 #include <vector> 16 17 namespace llvm { 18 namespace exegesis { 19 20 // The clustering problem has the following characteristics: 21 // (A) - Low dimension (dimensions are typically proc resource units, 22 // typically < 10). 23 // (B) - Number of points : ~thousands (points are measurements of an MCInst) 24 // (C) - Number of clusters: ~tens. 25 // (D) - The number of clusters is not known /a priory/. 26 // (E) - The amount of noise is relatively small. 27 // The problem is rather small. In terms of algorithms, (D) disqualifies 28 // k-means and makes algorithms such as DBSCAN[1] or OPTICS[2] more applicable. 29 // 30 // We've used DBSCAN here because it's simple to implement. This is a pretty 31 // straightforward and inefficient implementation of the pseudocode in [2]. 32 // 33 // [1] https://en.wikipedia.org/wiki/DBSCAN 34 // [2] https://en.wikipedia.org/wiki/OPTICS_algorithm 35 36 // Finds the points at distance less than sqrt(EpsilonSquared) of Q (not 37 // including Q). 38 void InstructionBenchmarkClustering::rangeQuery( 39 const size_t Q, std::vector<size_t> &Neighbors) const { 40 Neighbors.clear(); 41 Neighbors.reserve(Points_.size() - 1); // The Q itself isn't a neighbor. 42 const auto &QMeasurements = Points_[Q].Measurements; 43 for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { 44 if (P == Q) 45 continue; 46 const auto &PMeasurements = Points_[P].Measurements; 47 if (PMeasurements.empty()) // Error point. 48 continue; 49 if (isNeighbour(PMeasurements, QMeasurements, 50 AnalysisClusteringEpsilonSquared_)) { 51 Neighbors.push_back(P); 52 } 53 } 54 } 55 56 // Given a set of points, checks that all the points are neighbours 57 // up to AnalysisClusteringEpsilon. This is O(2*N). 58 bool InstructionBenchmarkClustering::areAllNeighbours( 59 ArrayRef<size_t> Pts) const { 60 // First, get the centroid of this group of points. This is O(N). 61 SchedClassClusterCentroid G; 62 for_each(Pts, [this, &G](size_t P) { 63 assert(P < Points_.size()); 64 ArrayRef<BenchmarkMeasure> Measurements = Points_[P].Measurements; 65 if (Measurements.empty()) // Error point. 66 return; 67 G.addPoint(Measurements); 68 }); 69 const std::vector<BenchmarkMeasure> Centroid = G.getAsPoint(); 70 71 // Since we will be comparing with the centroid, we need to halve the epsilon. 72 double AnalysisClusteringEpsilonHalvedSquared = 73 AnalysisClusteringEpsilonSquared_ / 4.0; 74 75 // And now check that every point is a neighbour of the centroid. Also O(N). 76 return all_of( 77 Pts, [this, &Centroid, AnalysisClusteringEpsilonHalvedSquared](size_t P) { 78 assert(P < Points_.size()); 79 const auto &PMeasurements = Points_[P].Measurements; 80 if (PMeasurements.empty()) // Error point. 81 return true; // Pretend that error point is a neighbour. 82 return isNeighbour(PMeasurements, Centroid, 83 AnalysisClusteringEpsilonHalvedSquared); 84 }); 85 } 86 87 InstructionBenchmarkClustering::InstructionBenchmarkClustering( 88 const std::vector<InstructionBenchmark> &Points, 89 const double AnalysisClusteringEpsilonSquared) 90 : Points_(Points), 91 AnalysisClusteringEpsilonSquared_(AnalysisClusteringEpsilonSquared), 92 NoiseCluster_(ClusterId::noise()), ErrorCluster_(ClusterId::error()) {} 93 94 Error InstructionBenchmarkClustering::validateAndSetup() { 95 ClusterIdForPoint_.resize(Points_.size()); 96 // Mark erroneous measurements out. 97 // All points must have the same number of dimensions, in the same order. 98 const std::vector<BenchmarkMeasure> *LastMeasurement = nullptr; 99 for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { 100 const auto &Point = Points_[P]; 101 if (!Point.Error.empty()) { 102 ClusterIdForPoint_[P] = ClusterId::error(); 103 ErrorCluster_.PointIndices.push_back(P); 104 continue; 105 } 106 const auto *CurMeasurement = &Point.Measurements; 107 if (LastMeasurement) { 108 if (LastMeasurement->size() != CurMeasurement->size()) { 109 return make_error<StringError>("inconsistent measurement dimensions", 110 inconvertibleErrorCode()); 111 } 112 for (size_t I = 0, E = LastMeasurement->size(); I < E; ++I) { 113 if (LastMeasurement->at(I).Key != CurMeasurement->at(I).Key) { 114 return make_error<StringError>( 115 "inconsistent measurement dimensions keys", 116 inconvertibleErrorCode()); 117 } 118 } 119 } 120 LastMeasurement = CurMeasurement; 121 } 122 if (LastMeasurement) { 123 NumDimensions_ = LastMeasurement->size(); 124 } 125 return Error::success(); 126 } 127 128 void InstructionBenchmarkClustering::clusterizeDbScan(const size_t MinPts) { 129 std::vector<size_t> Neighbors; // Persistent buffer to avoid allocs. 130 for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { 131 if (!ClusterIdForPoint_[P].isUndef()) 132 continue; // Previously processed in inner loop. 133 rangeQuery(P, Neighbors); 134 if (Neighbors.size() + 1 < MinPts) { // Density check. 135 // The region around P is not dense enough to create a new cluster, mark 136 // as noise for now. 137 ClusterIdForPoint_[P] = ClusterId::noise(); 138 continue; 139 } 140 141 // Create a new cluster, add P. 142 Clusters_.emplace_back(ClusterId::makeValid(Clusters_.size())); 143 Cluster &CurrentCluster = Clusters_.back(); 144 ClusterIdForPoint_[P] = CurrentCluster.Id; /* Label initial point */ 145 CurrentCluster.PointIndices.push_back(P); 146 147 // Process P's neighbors. 148 SetVector<size_t, std::deque<size_t>> ToProcess; 149 ToProcess.insert(Neighbors.begin(), Neighbors.end()); 150 while (!ToProcess.empty()) { 151 // Retrieve a point from the set. 152 const size_t Q = *ToProcess.begin(); 153 ToProcess.erase(ToProcess.begin()); 154 155 if (ClusterIdForPoint_[Q].isNoise()) { 156 // Change noise point to border point. 157 ClusterIdForPoint_[Q] = CurrentCluster.Id; 158 CurrentCluster.PointIndices.push_back(Q); 159 continue; 160 } 161 if (!ClusterIdForPoint_[Q].isUndef()) { 162 continue; // Previously processed. 163 } 164 // Add Q to the current custer. 165 ClusterIdForPoint_[Q] = CurrentCluster.Id; 166 CurrentCluster.PointIndices.push_back(Q); 167 // And extend to the neighbors of Q if the region is dense enough. 168 rangeQuery(Q, Neighbors); 169 if (Neighbors.size() + 1 >= MinPts) { 170 ToProcess.insert(Neighbors.begin(), Neighbors.end()); 171 } 172 } 173 } 174 // assert(Neighbors.capacity() == (Points_.size() - 1)); 175 // ^ True, but it is not quaranteed to be true in all the cases. 176 177 // Add noisy points to noise cluster. 178 for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { 179 if (ClusterIdForPoint_[P].isNoise()) { 180 NoiseCluster_.PointIndices.push_back(P); 181 } 182 } 183 } 184 185 void InstructionBenchmarkClustering::clusterizeNaive(unsigned NumOpcodes) { 186 // Given an instruction Opcode, which are the benchmarks of this instruction? 187 std::vector<SmallVector<size_t, 1>> OpcodeToPoints; 188 OpcodeToPoints.resize(NumOpcodes); 189 size_t NumOpcodesSeen = 0; 190 for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { 191 const InstructionBenchmark &Point = Points_[P]; 192 const unsigned Opcode = Point.keyInstruction().getOpcode(); 193 assert(Opcode < NumOpcodes && "NumOpcodes is incorrect (too small)"); 194 SmallVectorImpl<size_t> &PointsOfOpcode = OpcodeToPoints[Opcode]; 195 if (PointsOfOpcode.empty()) // If we previously have not seen any points of 196 ++NumOpcodesSeen; // this opcode, then naturally this is the new opcode. 197 PointsOfOpcode.emplace_back(P); 198 } 199 assert(OpcodeToPoints.size() == NumOpcodes && "sanity check"); 200 assert(NumOpcodesSeen <= NumOpcodes && 201 "can't see more opcodes than there are total opcodes"); 202 assert(NumOpcodesSeen <= Points_.size() && 203 "can't see more opcodes than there are total points"); 204 205 Clusters_.reserve(NumOpcodesSeen); // One cluster per opcode. 206 for (ArrayRef<size_t> PointsOfOpcode : 207 make_filter_range(OpcodeToPoints, [](ArrayRef<size_t> PointsOfOpcode) { 208 return !PointsOfOpcode.empty(); // Ignore opcodes with no points. 209 })) { 210 // Create a new cluster. 211 Clusters_.emplace_back(ClusterId::makeValid( 212 Clusters_.size(), /*IsUnstable=*/!areAllNeighbours(PointsOfOpcode))); 213 Cluster &CurrentCluster = Clusters_.back(); 214 // Mark points as belonging to the new cluster. 215 for_each(PointsOfOpcode, [this, &CurrentCluster](size_t P) { 216 ClusterIdForPoint_[P] = CurrentCluster.Id; 217 }); 218 // And add all the points of this opcode to the new cluster. 219 CurrentCluster.PointIndices.reserve(PointsOfOpcode.size()); 220 CurrentCluster.PointIndices.assign(PointsOfOpcode.begin(), 221 PointsOfOpcode.end()); 222 assert(CurrentCluster.PointIndices.size() == PointsOfOpcode.size()); 223 } 224 assert(Clusters_.size() == NumOpcodesSeen); 225 } 226 227 // Given an instruction Opcode, we can make benchmarks (measurements) of the 228 // instruction characteristics/performance. Then, to facilitate further analysis 229 // we group the benchmarks with *similar* characteristics into clusters. 230 // Now, this is all not entirely deterministic. Some instructions have variable 231 // characteristics, depending on their arguments. And thus, if we do several 232 // benchmarks of the same instruction Opcode, we may end up with *different* 233 // performance characteristics measurements. And when we then do clustering, 234 // these several benchmarks of the same instruction Opcode may end up being 235 // clustered into *different* clusters. This is not great for further analysis. 236 // We shall find every opcode with benchmarks not in just one cluster, and move 237 // *all* the benchmarks of said Opcode into one new unstable cluster per Opcode. 238 void InstructionBenchmarkClustering::stabilize(unsigned NumOpcodes) { 239 // Given an instruction Opcode and Config, in which clusters do benchmarks of 240 // this instruction lie? Normally, they all should be in the same cluster. 241 struct OpcodeAndConfig { 242 explicit OpcodeAndConfig(const InstructionBenchmark &IB) 243 : Opcode(IB.keyInstruction().getOpcode()), Config(&IB.Key.Config) {} 244 unsigned Opcode; 245 const std::string *Config; 246 247 auto Tie() const -> auto { return std::tie(Opcode, *Config); } 248 249 bool operator<(const OpcodeAndConfig &O) const { return Tie() < O.Tie(); } 250 bool operator!=(const OpcodeAndConfig &O) const { return Tie() != O.Tie(); } 251 }; 252 std::map<OpcodeAndConfig, SmallSet<ClusterId, 1>> OpcodeConfigToClusterIDs; 253 // Populate OpcodeConfigToClusterIDs and UnstableOpcodes data structures. 254 assert(ClusterIdForPoint_.size() == Points_.size() && "size mismatch"); 255 for (auto Point : zip(Points_, ClusterIdForPoint_)) { 256 const ClusterId &ClusterIdOfPoint = std::get<1>(Point); 257 if (!ClusterIdOfPoint.isValid()) 258 continue; // Only process fully valid clusters. 259 const OpcodeAndConfig Key(std::get<0>(Point)); 260 SmallSet<ClusterId, 1> &ClusterIDsOfOpcode = OpcodeConfigToClusterIDs[Key]; 261 ClusterIDsOfOpcode.insert(ClusterIdOfPoint); 262 } 263 264 for (const auto &OpcodeConfigToClusterID : OpcodeConfigToClusterIDs) { 265 const SmallSet<ClusterId, 1> &ClusterIDs = OpcodeConfigToClusterID.second; 266 const OpcodeAndConfig &Key = OpcodeConfigToClusterID.first; 267 // We only care about unstable instructions. 268 if (ClusterIDs.size() < 2) 269 continue; 270 271 // Create a new unstable cluster, one per Opcode. 272 Clusters_.emplace_back(ClusterId::makeValidUnstable(Clusters_.size())); 273 Cluster &UnstableCluster = Clusters_.back(); 274 // We will find *at least* one point in each of these clusters. 275 UnstableCluster.PointIndices.reserve(ClusterIDs.size()); 276 277 // Go through every cluster which we recorded as containing benchmarks 278 // of this UnstableOpcode. NOTE: we only recorded valid clusters. 279 for (const ClusterId &CID : ClusterIDs) { 280 assert(CID.isValid() && 281 "We only recorded valid clusters, not noise/error clusters."); 282 Cluster &OldCluster = Clusters_[CID.getId()]; // Valid clusters storage. 283 // Within each cluster, go through each point, and either move it to the 284 // new unstable cluster, or 'keep' it. 285 // In this case, we'll reshuffle OldCluster.PointIndices vector 286 // so that all the points that are *not* for UnstableOpcode are first, 287 // and the rest of the points is for the UnstableOpcode. 288 const auto it = std::stable_partition( 289 OldCluster.PointIndices.begin(), OldCluster.PointIndices.end(), 290 [this, &Key](size_t P) { 291 return OpcodeAndConfig(Points_[P]) != Key; 292 }); 293 assert(std::distance(it, OldCluster.PointIndices.end()) > 0 && 294 "Should have found at least one bad point"); 295 // Mark to-be-moved points as belonging to the new cluster. 296 std::for_each(it, OldCluster.PointIndices.end(), 297 [this, &UnstableCluster](size_t P) { 298 ClusterIdForPoint_[P] = UnstableCluster.Id; 299 }); 300 // Actually append to-be-moved points to the new cluster. 301 UnstableCluster.PointIndices.insert(UnstableCluster.PointIndices.end(), 302 it, OldCluster.PointIndices.end()); 303 // And finally, remove "to-be-moved" points form the old cluster. 304 OldCluster.PointIndices.erase(it, OldCluster.PointIndices.end()); 305 // Now, the old cluster may end up being empty, but let's just keep it 306 // in whatever state it ended up. Purging empty clusters isn't worth it. 307 }; 308 assert(UnstableCluster.PointIndices.size() > 1 && 309 "New unstable cluster should end up with more than one point."); 310 assert(UnstableCluster.PointIndices.size() >= ClusterIDs.size() && 311 "New unstable cluster should end up with no less points than there " 312 "was clusters"); 313 } 314 } 315 316 Expected<InstructionBenchmarkClustering> InstructionBenchmarkClustering::create( 317 const std::vector<InstructionBenchmark> &Points, const ModeE Mode, 318 const size_t DbscanMinPts, const double AnalysisClusteringEpsilon, 319 Optional<unsigned> NumOpcodes) { 320 InstructionBenchmarkClustering Clustering( 321 Points, AnalysisClusteringEpsilon * AnalysisClusteringEpsilon); 322 if (auto Error = Clustering.validateAndSetup()) { 323 return std::move(Error); 324 } 325 if (Clustering.ErrorCluster_.PointIndices.size() == Points.size()) { 326 return Clustering; // Nothing to cluster. 327 } 328 329 if (Mode == ModeE::Dbscan) { 330 Clustering.clusterizeDbScan(DbscanMinPts); 331 332 if (NumOpcodes.hasValue()) 333 Clustering.stabilize(NumOpcodes.getValue()); 334 } else /*if(Mode == ModeE::Naive)*/ { 335 if (!NumOpcodes.hasValue()) 336 report_fatal_error( 337 "'naive' clustering mode requires opcode count to be specified"); 338 Clustering.clusterizeNaive(NumOpcodes.getValue()); 339 } 340 341 return Clustering; 342 } 343 344 void SchedClassClusterCentroid::addPoint(ArrayRef<BenchmarkMeasure> Point) { 345 if (Representative.empty()) 346 Representative.resize(Point.size()); 347 assert(Representative.size() == Point.size() && 348 "All points should have identical dimensions."); 349 350 for (auto I : zip(Representative, Point)) 351 std::get<0>(I).push(std::get<1>(I)); 352 } 353 354 std::vector<BenchmarkMeasure> SchedClassClusterCentroid::getAsPoint() const { 355 std::vector<BenchmarkMeasure> ClusterCenterPoint(Representative.size()); 356 for (auto I : zip(ClusterCenterPoint, Representative)) 357 std::get<0>(I).PerInstructionValue = std::get<1>(I).avg(); 358 return ClusterCenterPoint; 359 } 360 361 bool SchedClassClusterCentroid::validate( 362 InstructionBenchmark::ModeE Mode) const { 363 size_t NumMeasurements = Representative.size(); 364 switch (Mode) { 365 case InstructionBenchmark::Latency: 366 if (NumMeasurements != 1) { 367 errs() 368 << "invalid number of measurements in latency mode: expected 1, got " 369 << NumMeasurements << "\n"; 370 return false; 371 } 372 break; 373 case InstructionBenchmark::Uops: 374 // Can have many measurements. 375 break; 376 case InstructionBenchmark::InverseThroughput: 377 if (NumMeasurements != 1) { 378 errs() << "invalid number of measurements in inverse throughput " 379 "mode: expected 1, got " 380 << NumMeasurements << "\n"; 381 return false; 382 } 383 break; 384 default: 385 llvm_unreachable("unimplemented measurement matching mode"); 386 return false; 387 } 388 389 return true; // All good. 390 } 391 392 } // namespace exegesis 393 } // namespace llvm 394