1 //===-- Benchmark function --------------------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 // This file mainly defines a `Benchmark` function. 10 // 11 // The benchmarking process is as follows: 12 // - We start by measuring the time it takes to run the function 13 // `InitialIterations` times. This is called a Sample. From this we can derive 14 // the time it took to run a single iteration. 15 // 16 // - We repeat the previous step with a greater number of iterations to lower 17 // the impact of the measurement. We can derive a more precise estimation of the 18 // runtime for a single iteration. 19 // 20 // - Each sample gives a more accurate estimation of the runtime for a single 21 // iteration but also takes more time to run. We stop the process when: 22 // * The measure stabilize under a certain precision (Epsilon), 23 // * The overall benchmarking time is greater than MaxDuration, 24 // * The overall sample count is greater than MaxSamples, 25 // * The last sample used more than MaxIterations iterations. 26 // 27 // - We also makes sure that the benchmark doesn't run for a too short period of 28 // time by defining MinDuration and MinSamples. 29 30 #ifndef LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H 31 #define LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H 32 33 #include "benchmark/benchmark.h" 34 #include "llvm/ADT/ArrayRef.h" 35 #include "llvm/ADT/SmallVector.h" 36 #include <array> 37 #include <chrono> 38 #include <cmath> 39 #include <cstdint> 40 #include <optional> 41 42 namespace llvm { 43 namespace libc_benchmarks { 44 45 using Duration = std::chrono::duration<double>; 46 47 enum class BenchmarkLog { 48 None, // Don't keep the internal state of the benchmark. 49 Last, // Keep only the last batch. 50 Full // Keep all iterations states, useful for testing or debugging. 51 }; 52 53 // An object to configure the benchmark stopping conditions. 54 // See documentation at the beginning of the file for the overall algorithm and 55 // meaning of each field. 56 struct BenchmarkOptions { 57 // The minimum time for which the benchmark is running. 58 Duration MinDuration = std::chrono::seconds(0); 59 // The maximum time for which the benchmark is running. 60 Duration MaxDuration = std::chrono::seconds(10); 61 // The number of iterations in the first sample. 62 uint32_t InitialIterations = 1; 63 // The maximum number of iterations for any given sample. 64 uint32_t MaxIterations = 10000000; 65 // The minimum number of samples. 66 uint32_t MinSamples = 4; 67 // The maximum number of samples. 68 uint32_t MaxSamples = 1000; 69 // The benchmark will stop if the relative difference between the current and 70 // the last estimation is less than epsilon. This is 1% by default. 71 double Epsilon = 0.01; 72 // The number of iterations grows exponentially between each sample. 73 // Must be greater or equal to 1. 74 double ScalingFactor = 1.4; 75 BenchmarkLog Log = BenchmarkLog::None; 76 }; 77 78 // The state of a benchmark. 79 enum class BenchmarkStatus { 80 Running, 81 MaxDurationReached, 82 MaxIterationsReached, 83 MaxSamplesReached, 84 PrecisionReached, 85 }; 86 87 // The internal state of the benchmark, useful to debug, test or report 88 // statistics. 89 struct BenchmarkState { 90 size_t LastSampleIterations; 91 Duration LastBatchElapsed; 92 BenchmarkStatus CurrentStatus; 93 Duration CurrentBestGuess; // The time estimation for a single run of `foo`. 94 double ChangeRatio; // The change in time estimation between previous and 95 // current samples. 96 }; 97 98 // A lightweight result for a benchmark. 99 struct BenchmarkResult { 100 BenchmarkStatus TerminationStatus = BenchmarkStatus::Running; 101 Duration BestGuess = {}; 102 std::optional<llvm::SmallVector<BenchmarkState, 16>> MaybeBenchmarkLog; 103 }; 104 105 // Stores information about a cache in the host memory system. 106 struct CacheInfo { 107 std::string Type; // e.g. "Instruction", "Data", "Unified". 108 int Level; // 0 is closest to processing unit. 109 int Size; // In bytes. 110 int NumSharing; // The number of processing units (Hyper-Threading Thread) 111 // with which this cache is shared. 112 }; 113 114 // Stores information about the host. 115 struct HostState { 116 std::string CpuName; // returns a string compatible with the -march option. 117 double CpuFrequency; // in Hertz. 118 std::vector<CacheInfo> Caches; 119 120 static HostState get(); 121 }; 122 123 namespace internal { 124 125 struct Measurement { 126 size_t Iterations = 0; 127 Duration Elapsed = {}; 128 }; 129 130 // Updates the estimation of the elapsed time for a single iteration. 131 class RefinableRuntimeEstimation { 132 Duration TotalTime = {}; 133 size_t TotalIterations = 0; 134 135 public: 136 Duration update(const Measurement &M) { 137 assert(M.Iterations > 0); 138 // Duration is encoded as a double (see definition). 139 // `TotalTime` and `M.Elapsed` are of the same magnitude so we don't expect 140 // loss of precision due to radically different scales. 141 TotalTime += M.Elapsed; 142 TotalIterations += M.Iterations; 143 return TotalTime / TotalIterations; 144 } 145 }; 146 147 // This class tracks the progression of the runtime estimation. 148 class RuntimeEstimationProgression { 149 RefinableRuntimeEstimation RRE; 150 151 public: 152 Duration CurrentEstimation = {}; 153 154 // Returns the change ratio between our best guess so far and the one from the 155 // new measurement. 156 double computeImprovement(const Measurement &M) { 157 const Duration NewEstimation = RRE.update(M); 158 const double Ratio = fabs(((CurrentEstimation / NewEstimation) - 1.0)); 159 CurrentEstimation = NewEstimation; 160 return Ratio; 161 } 162 }; 163 164 } // namespace internal 165 166 // Measures the runtime of `foo` until conditions defined by `Options` are met. 167 // 168 // To avoid measurement's imprecisions we measure batches of `foo`. 169 // The batch size is growing by `ScalingFactor` to minimize the effect of 170 // measuring. 171 // 172 // Note: The benchmark is not responsible for serializing the executions of 173 // `foo`. It is not suitable for measuring, very small & side effect free 174 // functions, as the processor is free to execute several executions in 175 // parallel. 176 // 177 // - Options: A set of parameters controlling the stopping conditions for the 178 // benchmark. 179 // - foo: The function under test. It takes one value and returns one value. 180 // The input value is used to randomize the execution of `foo` as part of a 181 // batch to mitigate the effect of the branch predictor. Signature: 182 // `ProductType foo(ParameterProvider::value_type value);` 183 // The output value is a product of the execution of `foo` and prevents the 184 // compiler from optimizing out foo's body. 185 // - ParameterProvider: An object responsible for providing a range of 186 // `Iterations` values to use as input for `foo`. The `value_type` of the 187 // returned container has to be compatible with `foo` argument. 188 // Must implement one of: 189 // `Container<ParameterType> generateBatch(size_t Iterations);` 190 // `const Container<ParameterType>& generateBatch(size_t Iterations);` 191 // - Clock: An object providing the current time. Must implement: 192 // `std::chrono::time_point now();` 193 template <typename Function, typename ParameterProvider, 194 typename BenchmarkClock = const std::chrono::high_resolution_clock> 195 BenchmarkResult benchmark(const BenchmarkOptions &Options, 196 ParameterProvider &PP, Function foo, 197 BenchmarkClock &Clock = BenchmarkClock()) { 198 BenchmarkResult Result; 199 internal::RuntimeEstimationProgression REP; 200 Duration TotalBenchmarkDuration = {}; 201 size_t Iterations = std::max(Options.InitialIterations, uint32_t(1)); 202 size_t Samples = 0; 203 if (Options.ScalingFactor < 1.0) 204 report_fatal_error("ScalingFactor should be >= 1"); 205 if (Options.Log != BenchmarkLog::None) 206 Result.MaybeBenchmarkLog.emplace(); 207 for (;;) { 208 // Request a new Batch of size `Iterations`. 209 const auto &Batch = PP.generateBatch(Iterations); 210 211 // Measuring this Batch. 212 const auto StartTime = Clock.now(); 213 for (const auto Parameter : Batch) { 214 auto Production = foo(Parameter); 215 benchmark::DoNotOptimize(Production); 216 } 217 const auto EndTime = Clock.now(); 218 const Duration Elapsed = EndTime - StartTime; 219 220 // Updating statistics. 221 ++Samples; 222 TotalBenchmarkDuration += Elapsed; 223 const double ChangeRatio = REP.computeImprovement({Iterations, Elapsed}); 224 Result.BestGuess = REP.CurrentEstimation; 225 226 // Stopping condition. 227 if (TotalBenchmarkDuration >= Options.MinDuration && 228 Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon) 229 Result.TerminationStatus = BenchmarkStatus::PrecisionReached; 230 else if (Samples >= Options.MaxSamples) 231 Result.TerminationStatus = BenchmarkStatus::MaxSamplesReached; 232 else if (TotalBenchmarkDuration >= Options.MaxDuration) 233 Result.TerminationStatus = BenchmarkStatus::MaxDurationReached; 234 else if (Iterations >= Options.MaxIterations) 235 Result.TerminationStatus = BenchmarkStatus::MaxIterationsReached; 236 237 if (Result.MaybeBenchmarkLog) { 238 auto &BenchmarkLog = *Result.MaybeBenchmarkLog; 239 if (Options.Log == BenchmarkLog::Last && !BenchmarkLog.empty()) 240 BenchmarkLog.pop_back(); 241 BenchmarkState BS; 242 BS.LastSampleIterations = Iterations; 243 BS.LastBatchElapsed = Elapsed; 244 BS.CurrentStatus = Result.TerminationStatus; 245 BS.CurrentBestGuess = Result.BestGuess; 246 BS.ChangeRatio = ChangeRatio; 247 BenchmarkLog.push_back(BS); 248 } 249 250 if (Result.TerminationStatus != BenchmarkStatus::Running) 251 return Result; 252 253 if (Options.ScalingFactor > 1 && 254 Iterations * Options.ScalingFactor == Iterations) 255 report_fatal_error( 256 "`Iterations *= ScalingFactor` is idempotent, increase ScalingFactor " 257 "or InitialIterations."); 258 259 Iterations *= Options.ScalingFactor; 260 } 261 } 262 263 // Interprets `Array` as a circular buffer of `Size` elements. 264 template <typename T> class CircularArrayRef { 265 llvm::ArrayRef<T> Array; 266 size_t Size; 267 268 public: 269 using value_type = T; 270 using reference = T &; 271 using const_reference = const T &; 272 using difference_type = ssize_t; 273 using size_type = size_t; 274 275 class const_iterator { 276 using iterator_category = std::input_iterator_tag; 277 llvm::ArrayRef<T> Array; 278 size_t Index; 279 size_t Offset; 280 281 public: 282 explicit const_iterator(llvm::ArrayRef<T> Array, size_t Index = 0) 283 : Array(Array), Index(Index), Offset(Index % Array.size()) {} 284 const_iterator &operator++() { 285 ++Index; 286 ++Offset; 287 if (Offset == Array.size()) 288 Offset = 0; 289 return *this; 290 } 291 bool operator==(const_iterator Other) const { return Index == Other.Index; } 292 bool operator!=(const_iterator Other) const { return !(*this == Other); } 293 const T &operator*() const { return Array[Offset]; } 294 }; 295 296 CircularArrayRef(llvm::ArrayRef<T> Array, size_t Size) 297 : Array(Array), Size(Size) { 298 assert(Array.size() > 0); 299 } 300 301 const_iterator begin() const { return const_iterator(Array); } 302 const_iterator end() const { return const_iterator(Array, Size); } 303 }; 304 305 // A convenient helper to produce a CircularArrayRef from an ArrayRef. 306 template <typename T> 307 CircularArrayRef<T> cycle(llvm::ArrayRef<T> Array, size_t Size) { 308 return {Array, Size}; 309 } 310 311 // Creates an std::array which storage size is constrained under `Bytes`. 312 template <typename T, size_t Bytes> 313 using ByteConstrainedArray = std::array<T, Bytes / sizeof(T)>; 314 315 // A convenient helper to produce a CircularArrayRef from a 316 // ByteConstrainedArray. 317 template <typename T, size_t N> 318 CircularArrayRef<T> cycle(const std::array<T, N> &Container, size_t Size) { 319 return {llvm::ArrayRef<T>(Container.cbegin(), Container.cend()), Size}; 320 } 321 322 // Makes sure the binary was compiled in release mode and that frequency 323 // governor is set on performance. 324 void checkRequirements(); 325 326 } // namespace libc_benchmarks 327 } // namespace llvm 328 329 #endif // LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H 330