xref: /llvm-project/libc/benchmarks/gpu/LibcGpuBenchmark.h (revision deb6b45c32687275a6d4e24326ffc9700f8ae52c)
1 #ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
2 #define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
3 
4 #include "benchmarks/gpu/BenchmarkLogger.h"
5 #include "benchmarks/gpu/timing/timing.h"
6 #include "src/__support/CPP/array.h"
7 #include "src/__support/CPP/functional.h"
8 #include "src/__support/CPP/limits.h"
9 #include "src/__support/CPP/string_view.h"
10 #include "src/__support/CPP/type_traits.h"
11 #include "src/__support/FPUtil/FPBits.h"
12 #include "src/__support/macros/config.h"
13 #include "src/stdlib/rand.h"
14 #include "src/time/clock.h"
15 
16 #include <stdint.h>
17 
18 namespace LIBC_NAMESPACE_DECL {
19 
20 namespace benchmarks {
21 
// Default limits handed to benchmark(). This header only declares the values;
// how each one is consumed is decided by the benchmark() definition, which is
// not part of this file.
struct BenchmarkOptions {
  // Iteration-count limits.
  uint32_t initial_iterations{1};
  uint32_t min_iterations{1};
  uint32_t max_iterations{10'000'000};

  // Sample-count limits.
  uint32_t min_samples{4};
  uint32_t max_samples{1000};

  // Durations are expressed in nanoseconds.
  int64_t min_duration{500'000};       // 500 us
  int64_t max_duration{1'000'000'000}; // 1 second

  double epsilon{0.0001};
  double scaling_factor{1.4};
};
33 
// One timing sample: the number of iterations that were executed and the
// total number of cycles they took.
struct Measurement {
  uint32_t iterations = 0;
  uint64_t elapsed_cycles = 0;
};

// Accumulates samples and reports the running average cost, in cycles, of a
// single iteration.
class RefinableRuntimeEstimation {
  uint64_t total_cycles = 0;
  // Kept in 64 bits so that summing many 32-bit iteration counts cannot wrap
  // around and corrupt (or zero) the divisor below.
  uint64_t total_iterations = 0;

public:
  // Folds M into the running totals and returns total cycles divided by total
  // iterations (integer division). While no iterations have been recorded the
  // result is 0 rather than a division by zero.
  uint64_t update(const Measurement &M) {
    total_cycles += M.elapsed_cycles;
    total_iterations += M.iterations;
    if (total_iterations == 0)
      return 0;
    return total_cycles / total_iterations;
  }
};

// Tracks the progression of the runtime estimation
class RuntimeEstimationProgression {
  RefinableRuntimeEstimation rre;

public:
  // Most recent per-iteration estimate; 0 until the first sample is folded in.
  uint64_t current_estimation = 0;

  // Folds M into the estimate and returns |previous / new - 1|, i.e. the
  // relative change between the previous and the refreshed estimate.
  // current_estimation is updated to the new estimate before returning.
  double compute_improvement(const Measurement &M) {
    const uint64_t new_estimation = rre.update(M);

    // An unchanged estimate is a change of exactly zero. Handling it up front
    // also keeps the "both estimates are 0" case from evaluating 0.0 / 0,
    // which would yield NaN and never compare below any threshold.
    double ratio = 0.0;
    if (new_estimation != current_estimation) {
      // If the new estimate is 0 while the previous one was not, this quotient
      // is +infinity on IEEE-754 targets, i.e. an arbitrarily large change.
      ratio = (static_cast<double>(current_estimation) /
               static_cast<double>(new_estimation)) -
              1.0;

      // Get absolute value
      if (ratio < 0)
        ratio = -ratio;
    }

    current_estimation = new_estimation;
    return ratio;
  }
};
71 
// Value type returned by benchmark(). The fields are filled in by the
// benchmark() definition, which is not part of this header.
struct BenchmarkResult {
  uint64_t cycles{0};
  double standard_deviation{0};
  // min starts at the largest and max at the smallest representable value --
  // presumably so the first recorded sample replaces both.
  uint64_t min{UINT64_MAX};
  uint64_t max{0};
  uint32_t samples{0};
  uint32_t total_iterations{0};
  clock_t total_time{0};
};
81 
// Benchmark driver. Declared here only; the definition lives outside this
// header. Takes the sampling limits in `options` and a callable returning a
// uint64_t, and returns a BenchmarkResult.
// NOTE(review): wrapper_func presumably returns the cycle count of one timed
// call -- confirm against the definition.
BenchmarkResult benchmark(const BenchmarkOptions &options,
                          cpp::function<uint64_t(void)> wrapper_func);
84 
85 class Benchmark {
86   const cpp::function<uint64_t(void)> func;
87   const cpp::string_view suite_name;
88   const cpp::string_view test_name;
89   const uint32_t num_threads;
90 
91 public:
92   Benchmark(cpp::function<uint64_t(void)> func, char const *suite_name,
93             char const *test_name, uint32_t num_threads)
94       : func(func), suite_name(suite_name), test_name(test_name),
95         num_threads(num_threads) {
96     add_benchmark(this);
97   }
98 
99   static void run_benchmarks();
100   const cpp::string_view get_suite_name() const { return suite_name; }
101   const cpp::string_view get_test_name() const { return test_name; }
102 
103 protected:
104   static void add_benchmark(Benchmark *benchmark);
105 
106 private:
107   BenchmarkResult run() {
108     BenchmarkOptions options;
109     return benchmark(options, func);
110   }
111 };
112 
113 // We want our random values to be approximately
114 // Output: a random number with the exponent field between min_exp and max_exp,
115 // i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1),
116 // Caveats:
117 //   -EXP_BIAS corresponding to denormal values,
118 //   EXP_BIAS + 1 corresponding to inf or nan.
119 template <typename T>
120 static T
121 get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
122                int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
123   using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
124 
125   // Required to correctly instantiate FPBits for floats and doubles.
126   using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>),
127                                                uint64_t, uint32_t>;
128   RandType bits;
129   if constexpr (cpp::is_same_v<T, uint64_t>)
130     bits = (static_cast<uint64_t>(LIBC_NAMESPACE::rand()) << 32) |
131            static_cast<uint64_t>(LIBC_NAMESPACE::rand());
132   else
133     bits = LIBC_NAMESPACE::rand();
134   double scale =
135       static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
136   FPBits fp(bits);
137   fp.set_biased_exponent(
138       static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
139   return fp.get_val();
140 }
141 
142 template <typename T> class MathPerf {
143   using FPBits = fputil::FPBits<T>;
144   using StorageType = typename FPBits::StorageType;
145   static constexpr StorageType UIntMax =
146       cpp::numeric_limits<StorageType>::max();
147 
148 public:
149   template <size_t N = 1>
150   static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) {
151     cpp::array<T, N> inputs;
152     for (size_t i = 0; i < N; ++i)
153       inputs[i] = get_rand_input<T>(min_exp, max_exp);
154 
155     uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs);
156 
157     return total_time / N;
158   }
159 
160   // Throughput benchmarking for functions that take 2 inputs.
161   template <size_t N = 1>
162   static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
163                                           int arg1_max_exp, int arg2_min_exp,
164                                           int arg2_max_exp) {
165     cpp::array<T, N> inputs1;
166     cpp::array<T, N> inputs2;
167     for (size_t i = 0; i < N; ++i) {
168       inputs1[i] = get_rand_input<T>(arg1_min_exp, arg1_max_exp);
169       inputs2[i] = get_rand_input<T>(arg2_min_exp, arg2_max_exp);
170     }
171 
172     uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2);
173 
174     return total_time / N;
175   }
176 };
177 
178 } // namespace benchmarks
179 } // namespace LIBC_NAMESPACE_DECL
180 
// Benchmark registration macros. Each one defines an object named
// <SuiteName>_<TestName>_Instance of type
// LIBC_NAMESPACE::benchmarks::Benchmark; the stringified suite and test names
// are passed to its constructor.

// Registers Func with an explicit thread count.
#define BENCHMARK_N_THREADS(SuiteName, TestName, Func, NumThreads)             \
  LIBC_NAMESPACE::benchmarks::Benchmark                                        \
      SuiteName##_##TestName##_Instance(Func, #SuiteName, #TestName,           \
                                        NumThreads)

// Passing -1 indicates the benchmark should be run with as many threads as
// allocated by the user in the benchmark's CMake.
#define BENCHMARK(SuiteName, TestName, Func)                                   \
  LIBC_NAMESPACE::benchmarks::Benchmark                                        \
      SuiteName##_##TestName##_Instance(Func, #SuiteName, #TestName, -1)

// Registers Func with a thread count of 1.
#define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func)                   \
  BENCHMARK_N_THREADS(SuiteName, TestName, Func, 1)

// Registers Func with LIBC_NAMESPACE::gpu::get_lane_size() threads.
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func)                       \
  BENCHMARK_N_THREADS(SuiteName, TestName, Func,                               \
                      LIBC_NAMESPACE::gpu::get_lane_size())
197 #endif
198