xref: /llvm-project/libc/benchmarks/gpu/timing/amdgpu/timing.h (revision deb6b45c32687275a6d4e24326ffc9700f8ae52c)
1 //===------------- AMDGPU implementation of timing utils --------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
10 #define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
11 
12 #include "src/__support/CPP/array.h"
13 #include "src/__support/CPP/type_traits.h"
14 #include "src/__support/GPU/utils.h"
15 #include "src/__support/common.h"
16 #include "src/__support/macros/attributes.h"
17 #include "src/__support/macros/config.h"
18 
19 #include <stdint.h>
20 
21 namespace LIBC_NAMESPACE_DECL {
22 
23 // Returns the overhead associated with calling the profiling region. This
24 // allows us to substract the constant-time overhead from the latency to
25 // obtain a true result. This can vary with system load.
26 [[gnu::noinline]] static LIBC_INLINE uint64_t overhead() {
27   gpu::memory_fence();
28   uint64_t start = gpu::processor_clock();
29   uint32_t result = 0.0;
30   asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
31   asm("" ::"s"(start));
32   uint64_t stop = gpu::processor_clock();
33   return stop - start;
34 }
35 
36 // Profile a simple function and obtain its latency in clock cycles on the
37 // system. This function cannot be inlined or else it will disturb the very
38 // delicate balance of hard-coded dependencies.
39 template <typename F, typename T>
40 [[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
41   // We need to store the input somewhere to guarantee that the compiler
42   // will not constant propagate it and remove the profiling region.
43   volatile T storage = t;
44   T arg = storage;
45 
46   // The AMDGPU architecture needs to wait on pending results.
47   gpu::memory_fence();
48   // Get the current timestamp from the clock.
49   uint64_t start = gpu::processor_clock();
50 
51   // This forces the compiler to load the input argument and run the clock
52   // cycle counter before the profiling region.
53   asm("" ::"s"(start));
54 
55   // Run the function under test and return its value.
56   auto result = f(arg);
57 
58   // This inline assembly performs a no-op which forces the result to both
59   // be used and prevents us from exiting this region before it's complete.
60   if constexpr (cpp::is_same_v<decltype(result), char> ||
61                 cpp::is_same_v<decltype(result), bool>)
62     // AMDGPU does not support input register constraints for i1 and i8, so we
63     // cast it to a 32-bit integer. This does not add an additional assembly
64     // instruction (https://godbolt.org/z/zxGqv8G91).
65     asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
66         static_cast<uint32_t>(result)));
67   else
68     asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
69 
70   // Obtain the current timestamp after running the calculation and force
71   // ordering.
72   uint64_t stop = gpu::processor_clock();
73   asm("" ::"s"(stop));
74   gpu::memory_fence();
75 
76   // Return the time elapsed.
77   return stop - start;
78 }
79 
80 template <typename F, typename T1, typename T2>
81 [[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
82   volatile T1 storage1 = t1;
83   volatile T2 storage2 = t2;
84   T1 arg1 = storage1;
85   T2 arg2 = storage2;
86 
87   gpu::memory_fence();
88   uint64_t start = gpu::processor_clock();
89 
90   asm("" ::"s"(start));
91 
92   auto result = f(arg1, arg2);
93 
94   if constexpr (cpp::is_same_v<decltype(result), char> ||
95                 cpp::is_same_v<decltype(result), bool>)
96     asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
97         static_cast<uint32_t>(result)));
98   else
99     asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
100 
101   uint64_t stop = gpu::processor_clock();
102   asm("" ::"s"(stop));
103   gpu::memory_fence();
104 
105   return stop - start;
106 }
107 
108 // Provides throughput benchmarking.
109 template <typename F, typename T, size_t N>
110 [[gnu::noinline]] static LIBC_INLINE uint64_t
111 throughput(F f, const cpp::array<T, N> &inputs) {
112   asm("" ::"v"(&inputs));
113 
114   gpu::memory_fence();
115   uint64_t start = gpu::processor_clock();
116 
117   asm("" ::"s"(start));
118 
119   for (auto input : inputs) {
120     auto result = f(input);
121 
122     asm("" ::"v"(result));
123   }
124 
125   uint64_t stop = gpu::processor_clock();
126   asm("" ::"s"(stop));
127   gpu::memory_fence();
128 
129   // Return the time elapsed.
130   return stop - start;
131 }
132 
133 // Provides throughput benchmarking for 2 arguments (e.g. atan2())
134 template <typename F, typename T, size_t N>
135 [[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
136     F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
137   asm("" ::"v"(&inputs1), "v"(&inputs2));
138 
139   gpu::memory_fence();
140   uint64_t start = gpu::processor_clock();
141 
142   asm("" ::"s"(start));
143 
144   for (size_t i = 0; i < inputs1.size(); i++) {
145     auto result = f(inputs1[i], inputs2[i]);
146 
147     asm("" ::"v"(result));
148   }
149 
150   uint64_t stop = gpu::processor_clock();
151   asm("" ::"s"(stop));
152   gpu::memory_fence();
153 
154   // Return the time elapsed.
155   return stop - start;
156 }
157 
158 } // namespace LIBC_NAMESPACE_DECL
159 
160 #endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
161