1 #include "LibcGpuBenchmark.h" 2 #include "src/__support/CPP/algorithm.h" 3 #include "src/__support/CPP/array.h" 4 #include "src/__support/CPP/atomic.h" 5 #include "src/__support/CPP/string.h" 6 #include "src/__support/FPUtil/sqrt.h" 7 #include "src/__support/GPU/utils.h" 8 #include "src/__support/fixedvector.h" 9 #include "src/__support/macros/config.h" 10 #include "src/stdio/printf.h" 11 #include "src/stdlib/srand.h" 12 #include "src/time/gpu/time_utils.h" 13 14 namespace LIBC_NAMESPACE_DECL { 15 namespace benchmarks { 16 17 FixedVector<Benchmark *, 64> benchmarks; 18 19 void Benchmark::add_benchmark(Benchmark *benchmark) { 20 benchmarks.push_back(benchmark); 21 } 22 23 struct AtomicBenchmarkSums { 24 cpp::Atomic<uint64_t> cycles_sum = 0; 25 cpp::Atomic<uint64_t> standard_deviation_sum = 0; 26 cpp::Atomic<uint64_t> min = UINT64_MAX; 27 cpp::Atomic<uint64_t> max = 0; 28 cpp::Atomic<uint32_t> samples_sum = 0; 29 cpp::Atomic<uint32_t> iterations_sum = 0; 30 cpp::Atomic<clock_t> time_sum = 0; 31 cpp::Atomic<uint64_t> active_threads = 0; 32 33 void reset() { 34 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); 35 active_threads.store(0, cpp::MemoryOrder::RELAXED); 36 cycles_sum.store(0, cpp::MemoryOrder::RELAXED); 37 standard_deviation_sum.store(0, cpp::MemoryOrder::RELAXED); 38 min.store(UINT64_MAX, cpp::MemoryOrder::RELAXED); 39 max.store(0, cpp::MemoryOrder::RELAXED); 40 samples_sum.store(0, cpp::MemoryOrder::RELAXED); 41 iterations_sum.store(0, cpp::MemoryOrder::RELAXED); 42 time_sum.store(0, cpp::MemoryOrder::RELAXED); 43 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); 44 } 45 46 void update(const BenchmarkResult &result) { 47 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); 48 active_threads.fetch_add(1, cpp::MemoryOrder::RELAXED); 49 50 cycles_sum.fetch_add(result.cycles, cpp::MemoryOrder::RELAXED); 51 standard_deviation_sum.fetch_add( 52 static_cast<uint64_t>(result.standard_deviation), 53 cpp::MemoryOrder::RELAXED); 54 55 // Perform a CAS loop to atomically update the min 56 uint64_t orig_min = min.load(cpp::MemoryOrder::RELAXED); 57 while (!min.compare_exchange_strong( 58 orig_min, cpp::min(orig_min, result.min), cpp::MemoryOrder::ACQUIRE, 59 cpp::MemoryOrder::RELAXED)) 60 ; 61 62 // Perform a CAS loop to atomically update the max 63 uint64_t orig_max = max.load(cpp::MemoryOrder::RELAXED); 64 while (!max.compare_exchange_strong( 65 orig_max, cpp::max(orig_max, result.max), cpp::MemoryOrder::ACQUIRE, 66 cpp::MemoryOrder::RELAXED)) 67 ; 68 69 samples_sum.fetch_add(result.samples, cpp::MemoryOrder::RELAXED); 70 iterations_sum.fetch_add(result.total_iterations, 71 cpp::MemoryOrder::RELAXED); 72 time_sum.fetch_add(result.total_time, cpp::MemoryOrder::RELAXED); 73 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); 74 } 75 }; 76 77 AtomicBenchmarkSums all_results; 78 constexpr auto GREEN = "\033[32m"; 79 constexpr auto RESET = "\033[0m"; 80 81 void print_results(Benchmark *b) { 82 BenchmarkResult result; 83 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); 84 int num_threads = all_results.active_threads.load(cpp::MemoryOrder::RELAXED); 85 result.cycles = 86 all_results.cycles_sum.load(cpp::MemoryOrder::RELAXED) / num_threads; 87 result.standard_deviation = 88 all_results.standard_deviation_sum.load(cpp::MemoryOrder::RELAXED) / 89 num_threads; 90 result.min = all_results.min.load(cpp::MemoryOrder::RELAXED); 91 result.max = all_results.max.load(cpp::MemoryOrder::RELAXED); 92 result.samples = 93 all_results.samples_sum.load(cpp::MemoryOrder::RELAXED) / num_threads; 94 result.total_iterations = 95 all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED) / num_threads; 96 const uint64_t duration_ns = 97 all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads; 98 const uint64_t duration_us = duration_ns / 1000; 99 const uint64_t duration_ms = duration_ns / (1000 * 1000); 100 uint64_t converted_duration = duration_ns; 101 const char *time_unit; 102 if (duration_ms != 0) { 103 converted_duration = duration_ms; 104 time_unit = "ms"; 105 } else if (duration_us != 0) { 106 converted_duration = duration_us; 107 time_unit = "us"; 108 } else { 109 converted_duration = duration_ns; 110 time_unit = "ns"; 111 } 112 result.total_time = converted_duration; 113 // result.total_time = 114 // all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads; 115 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); 116 117 LIBC_NAMESPACE::printf( 118 "%-24s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n", 119 b->get_test_name().data(), result.cycles, result.min, result.max, 120 result.total_iterations, result.total_time, time_unit, 121 static_cast<uint64_t>(result.standard_deviation), num_threads); 122 } 123 124 void print_header() { 125 LIBC_NAMESPACE::printf("%s", GREEN); 126 LIBC_NAMESPACE::printf("Running Suite: %-10s\n", 127 benchmarks[0]->get_suite_name().data()); 128 LIBC_NAMESPACE::printf("%s", RESET); 129 cpp::string titles = 130 "Benchmark | Cycles | Min | Max | " 131 "Iterations | Time / Iteration | Stddev | Threads |\n"; 132 LIBC_NAMESPACE::printf(titles.data()); 133 134 cpp::string separator(titles.size(), '-'); 135 separator[titles.size() - 1] = '\n'; 136 LIBC_NAMESPACE::printf(separator.data()); 137 } 138 139 void Benchmark::run_benchmarks() { 140 uint64_t id = gpu::get_thread_id(); 141 142 if (id == 0) { 143 print_header(); 144 LIBC_NAMESPACE::srand(gpu::processor_clock()); 145 } 146 147 gpu::sync_threads(); 148 149 for (Benchmark *b : benchmarks) { 150 if (id == 0) 151 all_results.reset(); 152 153 gpu::sync_threads(); 154 if (b->num_threads == static_cast<uint32_t>(-1) || id < b->num_threads) { 155 auto current_result = b->run(); 156 all_results.update(current_result); 157 } 158 gpu::sync_threads(); 159 160 if (id == 0) 161 print_results(b); 162 } 163 gpu::sync_threads(); 164 } 165 166 BenchmarkResult benchmark(const BenchmarkOptions &options, 167 cpp::function<uint64_t(void)> wrapper_func) { 168 BenchmarkResult result; 169 RuntimeEstimationProgression rep; 170 uint32_t total_iterations = 0; 171 uint32_t iterations = options.initial_iterations; 172 if (iterations < 1u) 173 iterations = 1; 174 175 uint32_t samples = 0; 176 uint64_t total_time = 0; 177 uint64_t best_guess = 0; 178 uint64_t cycles_squared = 0; 179 uint64_t min = UINT64_MAX; 180 uint64_t max = 0; 181 182 uint64_t overhead = UINT64_MAX; 183 int overhead_iterations = 10; 184 for (int i = 0; i < overhead_iterations; i++) 185 overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead()); 186 187 for (int64_t time_budget = options.max_duration; time_budget >= 0;) { 188 uint64_t sample_cycles = 0; 189 const clock_t start = static_cast<double>(clock()); 190 for (uint32_t i = 0; i < iterations; i++) { 191 auto wrapper_intermediate = wrapper_func(); 192 uint64_t current_result = wrapper_intermediate - overhead; 193 max = cpp::max(max, current_result); 194 min = cpp::min(min, current_result); 195 sample_cycles += current_result; 196 } 197 const clock_t end = clock(); 198 const clock_t duration_ns = 199 ((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC; 200 total_time += duration_ns; 201 time_budget -= duration_ns; 202 samples++; 203 cycles_squared += sample_cycles * sample_cycles; 204 205 total_iterations += iterations; 206 const double change_ratio = 207 rep.compute_improvement({iterations, sample_cycles}); 208 best_guess = rep.current_estimation; 209 210 if (samples >= options.max_samples || iterations >= options.max_iterations) 211 break; 212 if (total_time >= options.min_duration && samples >= options.min_samples && 213 total_iterations >= options.min_iterations && 214 change_ratio < options.epsilon) 215 break; 216 217 iterations *= options.scaling_factor; 218 } 219 result.cycles = best_guess; 220 result.standard_deviation = fputil::sqrt<double>( 221 static_cast<double>(cycles_squared) / total_iterations - 222 static_cast<double>(best_guess * best_guess)); 223 result.min = min; 224 result.max = max; 225 result.samples = samples; 226 result.total_iterations = total_iterations; 227 result.total_time = total_time / total_iterations; 228 return result; 229 }; 230 231 } // namespace benchmarks 232 } // namespace LIBC_NAMESPACE_DECL 233