102b57dedSjameshu15869 #include "LibcGpuBenchmark.h" 202b57dedSjameshu15869 #include "src/__support/CPP/algorithm.h" 302b57dedSjameshu15869 #include "src/__support/CPP/array.h" 4b42c332dSjameshu15869 #include "src/__support/CPP/atomic.h" 502b57dedSjameshu15869 #include "src/__support/CPP/string.h" 602b57dedSjameshu15869 #include "src/__support/FPUtil/sqrt.h" 702b57dedSjameshu15869 #include "src/__support/GPU/utils.h" 802b57dedSjameshu15869 #include "src/__support/fixedvector.h" 95ff3ff33SPetr Hosek #include "src/__support/macros/config.h" 10a964f2e8Sjameshu15869 #include "src/stdio/printf.h" 11677796caSjameshu15869 #include "src/stdlib/srand.h" 1202b57dedSjameshu15869 #include "src/time/gpu/time_utils.h" 1302b57dedSjameshu15869 145ff3ff33SPetr Hosek namespace LIBC_NAMESPACE_DECL { 1502b57dedSjameshu15869 namespace benchmarks { 1602b57dedSjameshu15869 1702b57dedSjameshu15869 FixedVector<Benchmark *, 64> benchmarks; 1802b57dedSjameshu15869 1902b57dedSjameshu15869 void Benchmark::add_benchmark(Benchmark *benchmark) { 2002b57dedSjameshu15869 benchmarks.push_back(benchmark); 2102b57dedSjameshu15869 } 2202b57dedSjameshu15869 23b42c332dSjameshu15869 struct AtomicBenchmarkSums { 24b42c332dSjameshu15869 cpp::Atomic<uint64_t> cycles_sum = 0; 25b42c332dSjameshu15869 cpp::Atomic<uint64_t> standard_deviation_sum = 0; 26b42c332dSjameshu15869 cpp::Atomic<uint64_t> min = UINT64_MAX; 27b42c332dSjameshu15869 cpp::Atomic<uint64_t> max = 0; 28b42c332dSjameshu15869 cpp::Atomic<uint32_t> samples_sum = 0; 29b42c332dSjameshu15869 cpp::Atomic<uint32_t> iterations_sum = 0; 30b42c332dSjameshu15869 cpp::Atomic<clock_t> time_sum = 0; 31b42c332dSjameshu15869 cpp::Atomic<uint64_t> active_threads = 0; 32b42c332dSjameshu15869 33b42c332dSjameshu15869 void reset() { 34b42c332dSjameshu15869 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); 35b42c332dSjameshu15869 active_threads.store(0, cpp::MemoryOrder::RELAXED); 36b42c332dSjameshu15869 cycles_sum.store(0, cpp::MemoryOrder::RELAXED); 37b42c332dSjameshu15869 standard_deviation_sum.store(0, cpp::MemoryOrder::RELAXED); 38b42c332dSjameshu15869 min.store(UINT64_MAX, cpp::MemoryOrder::RELAXED); 39b42c332dSjameshu15869 max.store(0, cpp::MemoryOrder::RELAXED); 40b42c332dSjameshu15869 samples_sum.store(0, cpp::MemoryOrder::RELAXED); 41b42c332dSjameshu15869 iterations_sum.store(0, cpp::MemoryOrder::RELAXED); 42b42c332dSjameshu15869 time_sum.store(0, cpp::MemoryOrder::RELAXED); 43b42c332dSjameshu15869 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); 4402b57dedSjameshu15869 } 45b42c332dSjameshu15869 46b42c332dSjameshu15869 void update(const BenchmarkResult &result) { 47b42c332dSjameshu15869 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); 48b42c332dSjameshu15869 active_threads.fetch_add(1, cpp::MemoryOrder::RELAXED); 49b42c332dSjameshu15869 50b42c332dSjameshu15869 cycles_sum.fetch_add(result.cycles, cpp::MemoryOrder::RELAXED); 51b42c332dSjameshu15869 standard_deviation_sum.fetch_add( 52b42c332dSjameshu15869 static_cast<uint64_t>(result.standard_deviation), 53b42c332dSjameshu15869 cpp::MemoryOrder::RELAXED); 54b42c332dSjameshu15869 55b42c332dSjameshu15869 // Perform a CAS loop to atomically update the min 56b42c332dSjameshu15869 uint64_t orig_min = min.load(cpp::MemoryOrder::RELAXED); 57b42c332dSjameshu15869 while (!min.compare_exchange_strong( 58b42c332dSjameshu15869 orig_min, cpp::min(orig_min, result.min), cpp::MemoryOrder::ACQUIRE, 59b42c332dSjameshu15869 cpp::MemoryOrder::RELAXED)) 60b42c332dSjameshu15869 ; 61b42c332dSjameshu15869 62b42c332dSjameshu15869 // Perform a CAS loop to atomically update the max 63b42c332dSjameshu15869 uint64_t orig_max = max.load(cpp::MemoryOrder::RELAXED); 64b42c332dSjameshu15869 while (!max.compare_exchange_strong( 65b42c332dSjameshu15869 orig_max, cpp::max(orig_max, result.max), cpp::MemoryOrder::ACQUIRE, 66b42c332dSjameshu15869 cpp::MemoryOrder::RELAXED)) 67b42c332dSjameshu15869 ; 68b42c332dSjameshu15869 69b42c332dSjameshu15869 samples_sum.fetch_add(result.samples, cpp::MemoryOrder::RELAXED); 70b42c332dSjameshu15869 iterations_sum.fetch_add(result.total_iterations, 71b42c332dSjameshu15869 cpp::MemoryOrder::RELAXED); 72b42c332dSjameshu15869 time_sum.fetch_add(result.total_time, cpp::MemoryOrder::RELAXED); 73b42c332dSjameshu15869 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); 74b42c332dSjameshu15869 } 75b42c332dSjameshu15869 }; 76b42c332dSjameshu15869 77b42c332dSjameshu15869 AtomicBenchmarkSums all_results; 78b42c332dSjameshu15869 constexpr auto GREEN = "\033[32m"; 79b42c332dSjameshu15869 constexpr auto RESET = "\033[0m"; 80b42c332dSjameshu15869 81a964f2e8Sjameshu15869 void print_results(Benchmark *b) { 82b42c332dSjameshu15869 BenchmarkResult result; 83b42c332dSjameshu15869 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); 84b42c332dSjameshu15869 int num_threads = all_results.active_threads.load(cpp::MemoryOrder::RELAXED); 85b42c332dSjameshu15869 result.cycles = 86b42c332dSjameshu15869 all_results.cycles_sum.load(cpp::MemoryOrder::RELAXED) / num_threads; 87b42c332dSjameshu15869 result.standard_deviation = 88b42c332dSjameshu15869 all_results.standard_deviation_sum.load(cpp::MemoryOrder::RELAXED) / 89b42c332dSjameshu15869 num_threads; 90b42c332dSjameshu15869 result.min = all_results.min.load(cpp::MemoryOrder::RELAXED); 91b42c332dSjameshu15869 result.max = all_results.max.load(cpp::MemoryOrder::RELAXED); 92b42c332dSjameshu15869 result.samples = 93b42c332dSjameshu15869 all_results.samples_sum.load(cpp::MemoryOrder::RELAXED) / num_threads; 94b42c332dSjameshu15869 result.total_iterations = 95b42c332dSjameshu15869 all_results.iterations_sum.load(cpp::MemoryOrder::RELAXED) / num_threads; 96a964f2e8Sjameshu15869 const uint64_t duration_ns = 97b42c332dSjameshu15869 all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads; 98a964f2e8Sjameshu15869 const uint64_t duration_us = duration_ns / 1000; 99a964f2e8Sjameshu15869 const uint64_t duration_ms = duration_ns / (1000 * 1000); 100a964f2e8Sjameshu15869 uint64_t converted_duration = duration_ns; 1016911f823SJoseph Huber const char *time_unit; 102a964f2e8Sjameshu15869 if (duration_ms != 0) { 103a964f2e8Sjameshu15869 converted_duration = duration_ms; 1046911f823SJoseph Huber time_unit = "ms"; 105a964f2e8Sjameshu15869 } else if (duration_us != 0) { 106a964f2e8Sjameshu15869 converted_duration = duration_us; 1076911f823SJoseph Huber time_unit = "us"; 108a964f2e8Sjameshu15869 } else { 109a964f2e8Sjameshu15869 converted_duration = duration_ns; 1106911f823SJoseph Huber time_unit = "ns"; 111a964f2e8Sjameshu15869 } 112a964f2e8Sjameshu15869 result.total_time = converted_duration; 113a964f2e8Sjameshu15869 // result.total_time = 114a964f2e8Sjameshu15869 // all_results.time_sum.load(cpp::MemoryOrder::RELAXED) / num_threads; 115b42c332dSjameshu15869 cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); 116b42c332dSjameshu15869 117a964f2e8Sjameshu15869 LIBC_NAMESPACE::printf( 118*deb6b45cSjameshu15869 "%-24s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n", 1196911f823SJoseph Huber b->get_test_name().data(), result.cycles, result.min, result.max, 1206911f823SJoseph Huber result.total_iterations, result.total_time, time_unit, 1216911f823SJoseph Huber static_cast<uint64_t>(result.standard_deviation), num_threads); 122a964f2e8Sjameshu15869 } 123a964f2e8Sjameshu15869 124a964f2e8Sjameshu15869 void print_header() { 125a964f2e8Sjameshu15869 LIBC_NAMESPACE::printf("%s", GREEN); 126a964f2e8Sjameshu15869 LIBC_NAMESPACE::printf("Running Suite: %-10s\n", 127a964f2e8Sjameshu15869 benchmarks[0]->get_suite_name().data()); 128a964f2e8Sjameshu15869 LIBC_NAMESPACE::printf("%s", RESET); 1291248698eSjameshu15869 cpp::string titles = 13039826b10Sjameshu15869 "Benchmark | Cycles | Min | Max | " 1311248698eSjameshu15869 "Iterations | Time / Iteration | Stddev | Threads |\n"; 1321248698eSjameshu15869 LIBC_NAMESPACE::printf(titles.data()); 1331248698eSjameshu15869 1341248698eSjameshu15869 cpp::string separator(titles.size(), '-'); 1351248698eSjameshu15869 separator[titles.size() - 1] = '\n'; 1361248698eSjameshu15869 LIBC_NAMESPACE::printf(separator.data()); 13702b57dedSjameshu15869 } 13802b57dedSjameshu15869 13902b57dedSjameshu15869 void Benchmark::run_benchmarks() { 14002b57dedSjameshu15869 uint64_t id = gpu::get_thread_id(); 141a964f2e8Sjameshu15869 142677796caSjameshu15869 if (id == 0) { 143a964f2e8Sjameshu15869 print_header(); 144677796caSjameshu15869 LIBC_NAMESPACE::srand(gpu::processor_clock()); 145677796caSjameshu15869 } 146a964f2e8Sjameshu15869 14702b57dedSjameshu15869 gpu::sync_threads(); 14802b57dedSjameshu15869 149eeed5896Sjameshu15869 for (Benchmark *b : benchmarks) { 150b42c332dSjameshu15869 if (id == 0) 151b42c332dSjameshu15869 all_results.reset(); 152b42c332dSjameshu15869 15302b57dedSjameshu15869 gpu::sync_threads(); 154197b1422Sjameshu15869 if (b->num_threads == static_cast<uint32_t>(-1) || id < b->num_threads) { 155b42c332dSjameshu15869 auto current_result = b->run(); 156b42c332dSjameshu15869 all_results.update(current_result); 1578badfcceSjameshu15869 } 158b42c332dSjameshu15869 gpu::sync_threads(); 159b42c332dSjameshu15869 160b42c332dSjameshu15869 if (id == 0) 161b42c332dSjameshu15869 print_results(b); 16202b57dedSjameshu15869 } 16302b57dedSjameshu15869 gpu::sync_threads(); 16402b57dedSjameshu15869 } 16502b57dedSjameshu15869 16602b57dedSjameshu15869 BenchmarkResult benchmark(const BenchmarkOptions &options, 16702b57dedSjameshu15869 cpp::function<uint64_t(void)> wrapper_func) { 16802b57dedSjameshu15869 BenchmarkResult result; 16902b57dedSjameshu15869 RuntimeEstimationProgression rep; 17002b57dedSjameshu15869 uint32_t total_iterations = 0; 17102b57dedSjameshu15869 uint32_t iterations = options.initial_iterations; 17202b57dedSjameshu15869 if (iterations < 1u) 17302b57dedSjameshu15869 iterations = 1; 17402b57dedSjameshu15869 17502b57dedSjameshu15869 uint32_t samples = 0; 17602b57dedSjameshu15869 uint64_t total_time = 0; 17702b57dedSjameshu15869 uint64_t best_guess = 0; 17802b57dedSjameshu15869 uint64_t cycles_squared = 0; 17902b57dedSjameshu15869 uint64_t min = UINT64_MAX; 18002b57dedSjameshu15869 uint64_t max = 0; 18102b57dedSjameshu15869 18202b57dedSjameshu15869 uint64_t overhead = UINT64_MAX; 18302b57dedSjameshu15869 int overhead_iterations = 10; 18402b57dedSjameshu15869 for (int i = 0; i < overhead_iterations; i++) 18502b57dedSjameshu15869 overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead()); 18602b57dedSjameshu15869 187f4e6ddbcSjameshu15869 for (int64_t time_budget = options.max_duration; time_budget >= 0;) { 18802b57dedSjameshu15869 uint64_t sample_cycles = 0; 18902b57dedSjameshu15869 const clock_t start = static_cast<double>(clock()); 19002b57dedSjameshu15869 for (uint32_t i = 0; i < iterations; i++) { 19102b57dedSjameshu15869 auto wrapper_intermediate = wrapper_func(); 192f4e6ddbcSjameshu15869 uint64_t current_result = wrapper_intermediate - overhead; 193f4e6ddbcSjameshu15869 max = cpp::max(max, current_result); 194f4e6ddbcSjameshu15869 min = cpp::min(min, current_result); 195f4e6ddbcSjameshu15869 sample_cycles += current_result; 19602b57dedSjameshu15869 } 19702b57dedSjameshu15869 const clock_t end = clock(); 19802b57dedSjameshu15869 const clock_t duration_ns = 19902b57dedSjameshu15869 ((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC; 20002b57dedSjameshu15869 total_time += duration_ns; 20102b57dedSjameshu15869 time_budget -= duration_ns; 20202b57dedSjameshu15869 samples++; 20302b57dedSjameshu15869 cycles_squared += sample_cycles * sample_cycles; 20402b57dedSjameshu15869 20502b57dedSjameshu15869 total_iterations += iterations; 20602b57dedSjameshu15869 const double change_ratio = 20702b57dedSjameshu15869 rep.compute_improvement({iterations, sample_cycles}); 20802b57dedSjameshu15869 best_guess = rep.current_estimation; 20902b57dedSjameshu15869 21002b57dedSjameshu15869 if (samples >= options.max_samples || iterations >= options.max_iterations) 21102b57dedSjameshu15869 break; 21202b57dedSjameshu15869 if (total_time >= options.min_duration && samples >= options.min_samples && 213a09c0f67Sjameshu15869 total_iterations >= options.min_iterations && 21402b57dedSjameshu15869 change_ratio < options.epsilon) 21502b57dedSjameshu15869 break; 21602b57dedSjameshu15869 21702b57dedSjameshu15869 iterations *= options.scaling_factor; 21802b57dedSjameshu15869 } 21902b57dedSjameshu15869 result.cycles = best_guess; 22002b57dedSjameshu15869 result.standard_deviation = fputil::sqrt<double>( 22102b57dedSjameshu15869 static_cast<double>(cycles_squared) / total_iterations - 22202b57dedSjameshu15869 static_cast<double>(best_guess * best_guess)); 22302b57dedSjameshu15869 result.min = min; 22402b57dedSjameshu15869 result.max = max; 22502b57dedSjameshu15869 result.samples = samples; 22602b57dedSjameshu15869 result.total_iterations = total_iterations; 22739826b10Sjameshu15869 result.total_time = total_time / total_iterations; 22802b57dedSjameshu15869 return result; 22902b57dedSjameshu15869 }; 23002b57dedSjameshu15869 23102b57dedSjameshu15869 } // namespace benchmarks 2325ff3ff33SPetr Hosek } // namespace LIBC_NAMESPACE_DECL 233