1 #ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H 2 #define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H 3 4 #include "benchmarks/gpu/BenchmarkLogger.h" 5 #include "benchmarks/gpu/timing/timing.h" 6 #include "src/__support/CPP/array.h" 7 #include "src/__support/CPP/functional.h" 8 #include "src/__support/CPP/limits.h" 9 #include "src/__support/CPP/string_view.h" 10 #include "src/__support/CPP/type_traits.h" 11 #include "src/__support/FPUtil/FPBits.h" 12 #include "src/__support/macros/config.h" 13 #include "src/stdlib/rand.h" 14 #include "src/time/clock.h" 15 16 #include <stdint.h> 17 18 namespace LIBC_NAMESPACE_DECL { 19 20 namespace benchmarks { 21 22 struct BenchmarkOptions { 23 uint32_t initial_iterations = 1; 24 uint32_t min_iterations = 1; 25 uint32_t max_iterations = 10000000; 26 uint32_t min_samples = 4; 27 uint32_t max_samples = 1000; 28 int64_t min_duration = 500 * 1000; // 500 * 1000 nanoseconds = 500 us 29 int64_t max_duration = 1000 * 1000 * 1000; // 1e9 nanoseconds = 1 second 30 double epsilon = 0.0001; 31 double scaling_factor = 1.4; 32 }; 33 34 struct Measurement { 35 uint32_t iterations = 0; 36 uint64_t elapsed_cycles = 0; 37 }; 38 39 class RefinableRuntimeEstimation { 40 uint64_t total_cycles = 0; 41 uint32_t total_iterations = 0; 42 43 public: 44 uint64_t update(const Measurement &M) { 45 total_cycles += M.elapsed_cycles; 46 total_iterations += M.iterations; 47 return total_cycles / total_iterations; 48 } 49 }; 50 51 // Tracks the progression of the runtime estimation 52 class RuntimeEstimationProgression { 53 RefinableRuntimeEstimation rre; 54 55 public: 56 uint64_t current_estimation = 0; 57 58 double compute_improvement(const Measurement &M) { 59 const uint64_t new_estimation = rre.update(M); 60 double ratio = 61 (static_cast<double>(current_estimation) / new_estimation) - 1.0; 62 63 // Get absolute value 64 if (ratio < 0) 65 ratio *= -1; 66 67 current_estimation = new_estimation; 68 return ratio; 69 } 70 }; 71 72 struct BenchmarkResult { 73 uint64_t cycles = 0; 74 double standard_deviation = 0; 75 uint64_t min = UINT64_MAX; 76 uint64_t max = 0; 77 uint32_t samples = 0; 78 uint32_t total_iterations = 0; 79 clock_t total_time = 0; 80 }; 81 82 BenchmarkResult benchmark(const BenchmarkOptions &options, 83 cpp::function<uint64_t(void)> wrapper_func); 84 85 class Benchmark { 86 const cpp::function<uint64_t(void)> func; 87 const cpp::string_view suite_name; 88 const cpp::string_view test_name; 89 const uint32_t num_threads; 90 91 public: 92 Benchmark(cpp::function<uint64_t(void)> func, char const *suite_name, 93 char const *test_name, uint32_t num_threads) 94 : func(func), suite_name(suite_name), test_name(test_name), 95 num_threads(num_threads) { 96 add_benchmark(this); 97 } 98 99 static void run_benchmarks(); 100 const cpp::string_view get_suite_name() const { return suite_name; } 101 const cpp::string_view get_test_name() const { return test_name; } 102 103 protected: 104 static void add_benchmark(Benchmark *benchmark); 105 106 private: 107 BenchmarkResult run() { 108 BenchmarkOptions options; 109 return benchmark(options, func); 110 } 111 }; 112 113 // We want our random values to be approximately 114 // Output: a random number with the exponent field between min_exp and max_exp, 115 // i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1), 116 // Caveats: 117 // -EXP_BIAS corresponding to denormal values, 118 // EXP_BIAS + 1 corresponding to inf or nan. 119 template <typename T> 120 static T 121 get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS, 122 int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) { 123 using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>; 124 125 // Required to correctly instantiate FPBits for floats and doubles. 126 using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>), 127 uint64_t, uint32_t>; 128 RandType bits; 129 if constexpr (cpp::is_same_v<T, uint64_t>) 130 bits = (static_cast<uint64_t>(LIBC_NAMESPACE::rand()) << 32) | 131 static_cast<uint64_t>(LIBC_NAMESPACE::rand()); 132 else 133 bits = LIBC_NAMESPACE::rand(); 134 double scale = 135 static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1); 136 FPBits fp(bits); 137 fp.set_biased_exponent( 138 static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp)); 139 return fp.get_val(); 140 } 141 142 template <typename T> class MathPerf { 143 using FPBits = fputil::FPBits<T>; 144 using StorageType = typename FPBits::StorageType; 145 static constexpr StorageType UIntMax = 146 cpp::numeric_limits<StorageType>::max(); 147 148 public: 149 template <size_t N = 1> 150 static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) { 151 cpp::array<T, N> inputs; 152 for (size_t i = 0; i < N; ++i) 153 inputs[i] = get_rand_input<T>(min_exp, max_exp); 154 155 uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs); 156 157 return total_time / N; 158 } 159 160 // Throughput benchmarking for functions that take 2 inputs. 161 template <size_t N = 1> 162 static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp, 163 int arg1_max_exp, int arg2_min_exp, 164 int arg2_max_exp) { 165 cpp::array<T, N> inputs1; 166 cpp::array<T, N> inputs2; 167 for (size_t i = 0; i < N; ++i) { 168 inputs1[i] = get_rand_input<T>(arg1_min_exp, arg1_max_exp); 169 inputs2[i] = get_rand_input<T>(arg2_min_exp, arg2_max_exp); 170 } 171 172 uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2); 173 174 return total_time / N; 175 } 176 }; 177 178 } // namespace benchmarks 179 } // namespace LIBC_NAMESPACE_DECL 180 181 // Passing -1 indicates the benchmark should be run with as many threads as 182 // allocated by the user in the benchmark's CMake. 183 #define BENCHMARK(SuiteName, TestName, Func) \ 184 LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \ 185 Func, #SuiteName, #TestName, -1) 186 187 #define BENCHMARK_N_THREADS(SuiteName, TestName, Func, NumThreads) \ 188 LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \ 189 Func, #SuiteName, #TestName, NumThreads) 190 191 #define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func) \ 192 BENCHMARK_N_THREADS(SuiteName, TestName, Func, 1) 193 194 #define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \ 195 BENCHMARK_N_THREADS(SuiteName, TestName, Func, \ 196 LIBC_NAMESPACE::gpu::get_lane_size()) 197 #endif 198