xref: /llvm-project/libc/benchmarks/gpu/timing/nvptx/timing.h (revision deb6b45c32687275a6d4e24326ffc9700f8ae52c)
1 //===------------- NVPTX implementation of timing utils ---------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
10 #define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
11 
12 #include "src/__support/CPP/array.h"
13 #include "src/__support/CPP/type_traits.h"
14 #include "src/__support/GPU/utils.h"
15 #include "src/__support/common.h"
16 #include "src/__support/macros/attributes.h"
17 #include "src/__support/macros/config.h"
18 
19 #include <stdint.h>
20 
21 namespace LIBC_NAMESPACE_DECL {
22 
23 // Returns the overhead associated with calling the profiling region. This
24 // allows us to substract the constant-time overhead from the latency to
25 // obtain a true result. This can vary with system load.
26 [[gnu::noinline]] static uint64_t overhead() {
27   volatile uint32_t x = 1;
28   uint32_t y = x;
29   uint64_t start = gpu::processor_clock();
30   asm("" ::"llr"(start));
31   uint32_t result = y;
32   asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
33   uint64_t stop = gpu::processor_clock();
34   volatile auto storage = result;
35   return stop - start;
36 }
37 
38 // Stimulate a simple function and obtain its latency in clock cycles on the
39 // system. This function cannot be inlined or else it will disturb the very
40 // delicate balance of hard-coded dependencies.
41 template <typename F, typename T>
42 [[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
43   // We need to store the input somewhere to guarantee that the compiler will
44   // not constant propagate it and remove the profiling region.
45   volatile T storage = t;
46   T arg = storage;
47 
48   // Get the current timestamp from the clock.
49   gpu::memory_fence();
50   uint64_t start = gpu::processor_clock();
51 
52   // This forces the compiler to load the input argument and run the clock cycle
53   // counter before the profiling region.
54   asm("" ::"llr"(start));
55 
56   // Run the function under test and return its value.
57   auto result = f(arg);
58 
59   // This inline assembly performs a no-op which forces the result to both be
60   // used and prevents us from exiting this region before it's complete.
61   asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
62 
63   // Obtain the current timestamp after running the calculation and force
64   // ordering.
65   uint64_t stop = gpu::processor_clock();
66   gpu::memory_fence();
67   asm("" ::"r"(stop));
68   volatile T output = result;
69 
70   // Return the time elapsed.
71   return stop - start;
72 }
73 
74 template <typename F, typename T1, typename T2>
75 static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
76   volatile T1 storage = t1;
77   volatile T2 storage2 = t2;
78   T1 arg = storage;
79   T2 arg2 = storage2;
80 
81   gpu::memory_fence();
82   uint64_t start = gpu::processor_clock();
83 
84   asm("" ::"llr"(start));
85 
86   auto result = f(arg, arg2);
87 
88   asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
89 
90   uint64_t stop = gpu::processor_clock();
91   gpu::memory_fence();
92   asm("" ::"r"(stop));
93   volatile auto output = result;
94 
95   return stop - start;
96 }
97 
98 // Provides throughput benchmarking.
99 template <typename F, typename T, size_t N>
100 [[gnu::noinline]] static LIBC_INLINE uint64_t
101 throughput(F f, const cpp::array<T, N> &inputs) {
102   asm("" ::"r"(&inputs));
103 
104   gpu::memory_fence();
105   uint64_t start = gpu::processor_clock();
106 
107   asm("" ::"llr"(start));
108 
109   uint64_t result;
110   for (auto input : inputs) {
111     asm("" ::"r"(input));
112     result = f(input);
113     asm("" ::"r"(result));
114   }
115 
116   uint64_t stop = gpu::processor_clock();
117   gpu::memory_fence();
118   asm("" ::"r"(stop));
119   volatile auto output = result;
120 
121   // Return the time elapsed.
122   return stop - start;
123 }
124 
125 // Provides throughput benchmarking for 2 arguments (e.g. atan2())
126 template <typename F, typename T, size_t N>
127 [[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
128     F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
129   asm("" ::"r"(&inputs1), "r"(&inputs2));
130 
131   gpu::memory_fence();
132   uint64_t start = gpu::processor_clock();
133 
134   asm("" ::"llr"(start));
135 
136   uint64_t result;
137   for (size_t i = 0; i < inputs1.size(); i++) {
138     result = f(inputs1[i], inputs2[i]);
139     asm("" ::"r"(result));
140   }
141 
142   uint64_t stop = gpu::processor_clock();
143   gpu::memory_fence();
144   asm("" ::"r"(stop));
145   volatile auto output = result;
146 
147   // Return the time elapsed.
148   return stop - start;
149 }
150 } // namespace LIBC_NAMESPACE_DECL
151 
152 #endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
153