1 //===-- Clustering.cpp ------------------------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "Clustering.h" 10 #include "Error.h" 11 #include "llvm/ADT/SetVector.h" 12 #include "llvm/ADT/SmallSet.h" 13 #include "llvm/ADT/SmallVector.h" 14 #include <algorithm> 15 #include <string> 16 #include <vector> 17 #include <deque> 18 19 namespace llvm { 20 namespace exegesis { 21 22 // The clustering problem has the following characteristics: 23 // (A) - Low dimension (dimensions are typically proc resource units, 24 // typically < 10). 25 // (B) - Number of points : ~thousands (points are measurements of an MCInst) 26 // (C) - Number of clusters: ~tens. 27 // (D) - The number of clusters is not known /a priory/. 28 // (E) - The amount of noise is relatively small. 29 // The problem is rather small. In terms of algorithms, (D) disqualifies 30 // k-means and makes algorithms such as DBSCAN[1] or OPTICS[2] more applicable. 31 // 32 // We've used DBSCAN here because it's simple to implement. This is a pretty 33 // straightforward and inefficient implementation of the pseudocode in [2]. 34 // 35 // [1] https://en.wikipedia.org/wiki/DBSCAN 36 // [2] https://en.wikipedia.org/wiki/OPTICS_algorithm 37 38 // Finds the points at distance less than sqrt(EpsilonSquared) of Q (not 39 // including Q). 40 void InstructionBenchmarkClustering::rangeQuery( 41 const size_t Q, std::vector<size_t> &Neighbors) const { 42 Neighbors.clear(); 43 Neighbors.reserve(Points_.size() - 1); // The Q itself isn't a neighbor. 44 const auto &QMeasurements = Points_[Q].Measurements; 45 for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { 46 if (P == Q) 47 continue; 48 const auto &PMeasurements = Points_[P].Measurements; 49 if (PMeasurements.empty()) // Error point. 50 continue; 51 if (isNeighbour(PMeasurements, QMeasurements, 52 AnalysisClusteringEpsilonSquared_)) { 53 Neighbors.push_back(P); 54 } 55 } 56 } 57 58 // Given a set of points, checks that all the points are neighbours 59 // up to AnalysisClusteringEpsilon. This is O(2*N). 60 bool InstructionBenchmarkClustering::areAllNeighbours( 61 ArrayRef<size_t> Pts) const { 62 // First, get the centroid of this group of points. This is O(N). 63 SchedClassClusterCentroid G; 64 for_each(Pts, [this, &G](size_t P) { 65 assert(P < Points_.size()); 66 ArrayRef<BenchmarkMeasure> Measurements = Points_[P].Measurements; 67 if (Measurements.empty()) // Error point. 68 return; 69 G.addPoint(Measurements); 70 }); 71 const std::vector<BenchmarkMeasure> Centroid = G.getAsPoint(); 72 73 // Since we will be comparing with the centroid, we need to halve the epsilon. 74 double AnalysisClusteringEpsilonHalvedSquared = 75 AnalysisClusteringEpsilonSquared_ / 4.0; 76 77 // And now check that every point is a neighbour of the centroid. Also O(N). 78 return all_of( 79 Pts, [this, &Centroid, AnalysisClusteringEpsilonHalvedSquared](size_t P) { 80 assert(P < Points_.size()); 81 const auto &PMeasurements = Points_[P].Measurements; 82 if (PMeasurements.empty()) // Error point. 83 return true; // Pretend that error point is a neighbour. 84 return isNeighbour(PMeasurements, Centroid, 85 AnalysisClusteringEpsilonHalvedSquared); 86 }); 87 } 88 89 InstructionBenchmarkClustering::InstructionBenchmarkClustering( 90 const std::vector<InstructionBenchmark> &Points, 91 const double AnalysisClusteringEpsilonSquared) 92 : Points_(Points), 93 AnalysisClusteringEpsilonSquared_(AnalysisClusteringEpsilonSquared), 94 NoiseCluster_(ClusterId::noise()), ErrorCluster_(ClusterId::error()) {} 95 96 Error InstructionBenchmarkClustering::validateAndSetup() { 97 ClusterIdForPoint_.resize(Points_.size()); 98 // Mark erroneous measurements out. 99 // All points must have the same number of dimensions, in the same order. 100 const std::vector<BenchmarkMeasure> *LastMeasurement = nullptr; 101 for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { 102 const auto &Point = Points_[P]; 103 if (!Point.Error.empty()) { 104 ClusterIdForPoint_[P] = ClusterId::error(); 105 ErrorCluster_.PointIndices.push_back(P); 106 continue; 107 } 108 const auto *CurMeasurement = &Point.Measurements; 109 if (LastMeasurement) { 110 if (LastMeasurement->size() != CurMeasurement->size()) { 111 return make_error<ClusteringError>( 112 "inconsistent measurement dimensions"); 113 } 114 for (size_t I = 0, E = LastMeasurement->size(); I < E; ++I) { 115 if (LastMeasurement->at(I).Key != CurMeasurement->at(I).Key) { 116 return make_error<ClusteringError>( 117 "inconsistent measurement dimensions keys"); 118 } 119 } 120 } 121 LastMeasurement = CurMeasurement; 122 } 123 if (LastMeasurement) { 124 NumDimensions_ = LastMeasurement->size(); 125 } 126 return Error::success(); 127 } 128 129 void InstructionBenchmarkClustering::clusterizeDbScan(const size_t MinPts) { 130 std::vector<size_t> Neighbors; // Persistent buffer to avoid allocs. 131 for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { 132 if (!ClusterIdForPoint_[P].isUndef()) 133 continue; // Previously processed in inner loop. 134 rangeQuery(P, Neighbors); 135 if (Neighbors.size() + 1 < MinPts) { // Density check. 136 // The region around P is not dense enough to create a new cluster, mark 137 // as noise for now. 138 ClusterIdForPoint_[P] = ClusterId::noise(); 139 continue; 140 } 141 142 // Create a new cluster, add P. 143 Clusters_.emplace_back(ClusterId::makeValid(Clusters_.size())); 144 Cluster &CurrentCluster = Clusters_.back(); 145 ClusterIdForPoint_[P] = CurrentCluster.Id; /* Label initial point */ 146 CurrentCluster.PointIndices.push_back(P); 147 148 // Process P's neighbors. 149 SetVector<size_t, std::deque<size_t>> ToProcess; 150 ToProcess.insert(Neighbors.begin(), Neighbors.end()); 151 while (!ToProcess.empty()) { 152 // Retrieve a point from the set. 153 const size_t Q = *ToProcess.begin(); 154 ToProcess.erase(ToProcess.begin()); 155 156 if (ClusterIdForPoint_[Q].isNoise()) { 157 // Change noise point to border point. 158 ClusterIdForPoint_[Q] = CurrentCluster.Id; 159 CurrentCluster.PointIndices.push_back(Q); 160 continue; 161 } 162 if (!ClusterIdForPoint_[Q].isUndef()) { 163 continue; // Previously processed. 164 } 165 // Add Q to the current custer. 166 ClusterIdForPoint_[Q] = CurrentCluster.Id; 167 CurrentCluster.PointIndices.push_back(Q); 168 // And extend to the neighbors of Q if the region is dense enough. 169 rangeQuery(Q, Neighbors); 170 if (Neighbors.size() + 1 >= MinPts) { 171 ToProcess.insert(Neighbors.begin(), Neighbors.end()); 172 } 173 } 174 } 175 // assert(Neighbors.capacity() == (Points_.size() - 1)); 176 // ^ True, but it is not quaranteed to be true in all the cases. 177 178 // Add noisy points to noise cluster. 179 for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { 180 if (ClusterIdForPoint_[P].isNoise()) { 181 NoiseCluster_.PointIndices.push_back(P); 182 } 183 } 184 } 185 186 void InstructionBenchmarkClustering::clusterizeNaive(unsigned NumOpcodes) { 187 // Given an instruction Opcode, which are the benchmarks of this instruction? 188 std::vector<SmallVector<size_t, 1>> OpcodeToPoints; 189 OpcodeToPoints.resize(NumOpcodes); 190 size_t NumOpcodesSeen = 0; 191 for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { 192 const InstructionBenchmark &Point = Points_[P]; 193 const unsigned Opcode = Point.keyInstruction().getOpcode(); 194 assert(Opcode < NumOpcodes && "NumOpcodes is incorrect (too small)"); 195 SmallVectorImpl<size_t> &PointsOfOpcode = OpcodeToPoints[Opcode]; 196 if (PointsOfOpcode.empty()) // If we previously have not seen any points of 197 ++NumOpcodesSeen; // this opcode, then naturally this is the new opcode. 198 PointsOfOpcode.emplace_back(P); 199 } 200 assert(OpcodeToPoints.size() == NumOpcodes && "sanity check"); 201 assert(NumOpcodesSeen <= NumOpcodes && 202 "can't see more opcodes than there are total opcodes"); 203 assert(NumOpcodesSeen <= Points_.size() && 204 "can't see more opcodes than there are total points"); 205 206 Clusters_.reserve(NumOpcodesSeen); // One cluster per opcode. 207 for (ArrayRef<size_t> PointsOfOpcode : 208 make_filter_range(OpcodeToPoints, [](ArrayRef<size_t> PointsOfOpcode) { 209 return !PointsOfOpcode.empty(); // Ignore opcodes with no points. 210 })) { 211 // Create a new cluster. 212 Clusters_.emplace_back(ClusterId::makeValid( 213 Clusters_.size(), /*IsUnstable=*/!areAllNeighbours(PointsOfOpcode))); 214 Cluster &CurrentCluster = Clusters_.back(); 215 // Mark points as belonging to the new cluster. 216 for_each(PointsOfOpcode, [this, &CurrentCluster](size_t P) { 217 ClusterIdForPoint_[P] = CurrentCluster.Id; 218 }); 219 // And add all the points of this opcode to the new cluster. 220 CurrentCluster.PointIndices.reserve(PointsOfOpcode.size()); 221 CurrentCluster.PointIndices.assign(PointsOfOpcode.begin(), 222 PointsOfOpcode.end()); 223 assert(CurrentCluster.PointIndices.size() == PointsOfOpcode.size()); 224 } 225 assert(Clusters_.size() == NumOpcodesSeen); 226 } 227 228 // Given an instruction Opcode, we can make benchmarks (measurements) of the 229 // instruction characteristics/performance. Then, to facilitate further analysis 230 // we group the benchmarks with *similar* characteristics into clusters. 231 // Now, this is all not entirely deterministic. Some instructions have variable 232 // characteristics, depending on their arguments. And thus, if we do several 233 // benchmarks of the same instruction Opcode, we may end up with *different* 234 // performance characteristics measurements. And when we then do clustering, 235 // these several benchmarks of the same instruction Opcode may end up being 236 // clustered into *different* clusters. This is not great for further analysis. 237 // We shall find every opcode with benchmarks not in just one cluster, and move 238 // *all* the benchmarks of said Opcode into one new unstable cluster per Opcode. 239 void InstructionBenchmarkClustering::stabilize(unsigned NumOpcodes) { 240 // Given an instruction Opcode and Config, in which clusters do benchmarks of 241 // this instruction lie? Normally, they all should be in the same cluster. 242 struct OpcodeAndConfig { 243 explicit OpcodeAndConfig(const InstructionBenchmark &IB) 244 : Opcode(IB.keyInstruction().getOpcode()), Config(&IB.Key.Config) {} 245 unsigned Opcode; 246 const std::string *Config; 247 248 auto Tie() const -> auto { return std::tie(Opcode, *Config); } 249 250 bool operator<(const OpcodeAndConfig &O) const { return Tie() < O.Tie(); } 251 bool operator!=(const OpcodeAndConfig &O) const { return Tie() != O.Tie(); } 252 }; 253 std::map<OpcodeAndConfig, SmallSet<ClusterId, 1>> OpcodeConfigToClusterIDs; 254 // Populate OpcodeConfigToClusterIDs and UnstableOpcodes data structures. 255 assert(ClusterIdForPoint_.size() == Points_.size() && "size mismatch"); 256 for (auto Point : zip(Points_, ClusterIdForPoint_)) { 257 const ClusterId &ClusterIdOfPoint = std::get<1>(Point); 258 if (!ClusterIdOfPoint.isValid()) 259 continue; // Only process fully valid clusters. 260 const OpcodeAndConfig Key(std::get<0>(Point)); 261 SmallSet<ClusterId, 1> &ClusterIDsOfOpcode = OpcodeConfigToClusterIDs[Key]; 262 ClusterIDsOfOpcode.insert(ClusterIdOfPoint); 263 } 264 265 for (const auto &OpcodeConfigToClusterID : OpcodeConfigToClusterIDs) { 266 const SmallSet<ClusterId, 1> &ClusterIDs = OpcodeConfigToClusterID.second; 267 const OpcodeAndConfig &Key = OpcodeConfigToClusterID.first; 268 // We only care about unstable instructions. 269 if (ClusterIDs.size() < 2) 270 continue; 271 272 // Create a new unstable cluster, one per Opcode. 273 Clusters_.emplace_back(ClusterId::makeValidUnstable(Clusters_.size())); 274 Cluster &UnstableCluster = Clusters_.back(); 275 // We will find *at least* one point in each of these clusters. 276 UnstableCluster.PointIndices.reserve(ClusterIDs.size()); 277 278 // Go through every cluster which we recorded as containing benchmarks 279 // of this UnstableOpcode. NOTE: we only recorded valid clusters. 280 for (const ClusterId &CID : ClusterIDs) { 281 assert(CID.isValid() && 282 "We only recorded valid clusters, not noise/error clusters."); 283 Cluster &OldCluster = Clusters_[CID.getId()]; // Valid clusters storage. 284 // Within each cluster, go through each point, and either move it to the 285 // new unstable cluster, or 'keep' it. 286 // In this case, we'll reshuffle OldCluster.PointIndices vector 287 // so that all the points that are *not* for UnstableOpcode are first, 288 // and the rest of the points is for the UnstableOpcode. 289 const auto it = std::stable_partition( 290 OldCluster.PointIndices.begin(), OldCluster.PointIndices.end(), 291 [this, &Key](size_t P) { 292 return OpcodeAndConfig(Points_[P]) != Key; 293 }); 294 assert(std::distance(it, OldCluster.PointIndices.end()) > 0 && 295 "Should have found at least one bad point"); 296 // Mark to-be-moved points as belonging to the new cluster. 297 std::for_each(it, OldCluster.PointIndices.end(), 298 [this, &UnstableCluster](size_t P) { 299 ClusterIdForPoint_[P] = UnstableCluster.Id; 300 }); 301 // Actually append to-be-moved points to the new cluster. 302 UnstableCluster.PointIndices.insert(UnstableCluster.PointIndices.end(), 303 it, OldCluster.PointIndices.end()); 304 // And finally, remove "to-be-moved" points form the old cluster. 305 OldCluster.PointIndices.erase(it, OldCluster.PointIndices.end()); 306 // Now, the old cluster may end up being empty, but let's just keep it 307 // in whatever state it ended up. Purging empty clusters isn't worth it. 308 }; 309 assert(UnstableCluster.PointIndices.size() > 1 && 310 "New unstable cluster should end up with more than one point."); 311 assert(UnstableCluster.PointIndices.size() >= ClusterIDs.size() && 312 "New unstable cluster should end up with no less points than there " 313 "was clusters"); 314 } 315 } 316 317 Expected<InstructionBenchmarkClustering> InstructionBenchmarkClustering::create( 318 const std::vector<InstructionBenchmark> &Points, const ModeE Mode, 319 const size_t DbscanMinPts, const double AnalysisClusteringEpsilon, 320 Optional<unsigned> NumOpcodes) { 321 InstructionBenchmarkClustering Clustering( 322 Points, AnalysisClusteringEpsilon * AnalysisClusteringEpsilon); 323 if (auto Error = Clustering.validateAndSetup()) { 324 return std::move(Error); 325 } 326 if (Clustering.ErrorCluster_.PointIndices.size() == Points.size()) { 327 return Clustering; // Nothing to cluster. 328 } 329 330 if (Mode == ModeE::Dbscan) { 331 Clustering.clusterizeDbScan(DbscanMinPts); 332 333 if (NumOpcodes.hasValue()) 334 Clustering.stabilize(NumOpcodes.getValue()); 335 } else /*if(Mode == ModeE::Naive)*/ { 336 if (!NumOpcodes.hasValue()) 337 return make_error<Failure>( 338 "'naive' clustering mode requires opcode count to be specified"); 339 Clustering.clusterizeNaive(NumOpcodes.getValue()); 340 } 341 342 return Clustering; 343 } 344 345 void SchedClassClusterCentroid::addPoint(ArrayRef<BenchmarkMeasure> Point) { 346 if (Representative.empty()) 347 Representative.resize(Point.size()); 348 assert(Representative.size() == Point.size() && 349 "All points should have identical dimensions."); 350 351 for (auto I : zip(Representative, Point)) 352 std::get<0>(I).push(std::get<1>(I)); 353 } 354 355 std::vector<BenchmarkMeasure> SchedClassClusterCentroid::getAsPoint() const { 356 std::vector<BenchmarkMeasure> ClusterCenterPoint(Representative.size()); 357 for (auto I : zip(ClusterCenterPoint, Representative)) 358 std::get<0>(I).PerInstructionValue = std::get<1>(I).avg(); 359 return ClusterCenterPoint; 360 } 361 362 bool SchedClassClusterCentroid::validate( 363 InstructionBenchmark::ModeE Mode) const { 364 size_t NumMeasurements = Representative.size(); 365 switch (Mode) { 366 case InstructionBenchmark::Latency: 367 if (NumMeasurements != 1) { 368 errs() 369 << "invalid number of measurements in latency mode: expected 1, got " 370 << NumMeasurements << "\n"; 371 return false; 372 } 373 break; 374 case InstructionBenchmark::Uops: 375 // Can have many measurements. 376 break; 377 case InstructionBenchmark::InverseThroughput: 378 if (NumMeasurements != 1) { 379 errs() << "invalid number of measurements in inverse throughput " 380 "mode: expected 1, got " 381 << NumMeasurements << "\n"; 382 return false; 383 } 384 break; 385 default: 386 llvm_unreachable("unimplemented measurement matching mode"); 387 return false; 388 } 389 390 return true; // All good. 391 } 392 393 } // namespace exegesis 394 } // namespace llvm 395