1 //===------------- AMDGPU implementation of timing utils --------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU 10 #define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU 11 12 #include "src/__support/CPP/array.h" 13 #include "src/__support/CPP/type_traits.h" 14 #include "src/__support/GPU/utils.h" 15 #include "src/__support/common.h" 16 #include "src/__support/macros/attributes.h" 17 #include "src/__support/macros/config.h" 18 19 #include <stdint.h> 20 21 namespace LIBC_NAMESPACE_DECL { 22 23 // Returns the overhead associated with calling the profiling region. This 24 // allows us to substract the constant-time overhead from the latency to 25 // obtain a true result. This can vary with system load. 26 [[gnu::noinline]] static LIBC_INLINE uint64_t overhead() { 27 gpu::memory_fence(); 28 uint64_t start = gpu::processor_clock(); 29 uint32_t result = 0.0; 30 asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result)); 31 asm("" ::"s"(start)); 32 uint64_t stop = gpu::processor_clock(); 33 return stop - start; 34 } 35 36 // Profile a simple function and obtain its latency in clock cycles on the 37 // system. This function cannot be inlined or else it will disturb the very 38 // delicate balance of hard-coded dependencies. 39 template <typename F, typename T> 40 [[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) { 41 // We need to store the input somewhere to guarantee that the compiler 42 // will not constant propagate it and remove the profiling region. 43 volatile T storage = t; 44 T arg = storage; 45 46 // The AMDGPU architecture needs to wait on pending results. 47 gpu::memory_fence(); 48 // Get the current timestamp from the clock. 49 uint64_t start = gpu::processor_clock(); 50 51 // This forces the compiler to load the input argument and run the clock 52 // cycle counter before the profiling region. 53 asm("" ::"s"(start)); 54 55 // Run the function under test and return its value. 56 auto result = f(arg); 57 58 // This inline assembly performs a no-op which forces the result to both 59 // be used and prevents us from exiting this region before it's complete. 60 if constexpr (cpp::is_same_v<decltype(result), char> || 61 cpp::is_same_v<decltype(result), bool>) 62 // AMDGPU does not support input register constraints for i1 and i8, so we 63 // cast it to a 32-bit integer. This does not add an additional assembly 64 // instruction (https://godbolt.org/z/zxGqv8G91). 65 asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"( 66 static_cast<uint32_t>(result))); 67 else 68 asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result)); 69 70 // Obtain the current timestamp after running the calculation and force 71 // ordering. 72 uint64_t stop = gpu::processor_clock(); 73 asm("" ::"s"(stop)); 74 gpu::memory_fence(); 75 76 // Return the time elapsed. 77 return stop - start; 78 } 79 80 template <typename F, typename T1, typename T2> 81 [[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) { 82 volatile T1 storage1 = t1; 83 volatile T2 storage2 = t2; 84 T1 arg1 = storage1; 85 T2 arg2 = storage2; 86 87 gpu::memory_fence(); 88 uint64_t start = gpu::processor_clock(); 89 90 asm("" ::"s"(start)); 91 92 auto result = f(arg1, arg2); 93 94 if constexpr (cpp::is_same_v<decltype(result), char> || 95 cpp::is_same_v<decltype(result), bool>) 96 asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"( 97 static_cast<uint32_t>(result))); 98 else 99 asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result)); 100 101 uint64_t stop = gpu::processor_clock(); 102 asm("" ::"s"(stop)); 103 gpu::memory_fence(); 104 105 return stop - start; 106 } 107 108 // Provides throughput benchmarking. 109 template <typename F, typename T, size_t N> 110 [[gnu::noinline]] static LIBC_INLINE uint64_t 111 throughput(F f, const cpp::array<T, N> &inputs) { 112 asm("" ::"v"(&inputs)); 113 114 gpu::memory_fence(); 115 uint64_t start = gpu::processor_clock(); 116 117 asm("" ::"s"(start)); 118 119 for (auto input : inputs) { 120 auto result = f(input); 121 122 asm("" ::"v"(result)); 123 } 124 125 uint64_t stop = gpu::processor_clock(); 126 asm("" ::"s"(stop)); 127 gpu::memory_fence(); 128 129 // Return the time elapsed. 130 return stop - start; 131 } 132 133 // Provides throughput benchmarking for 2 arguments (e.g. atan2()) 134 template <typename F, typename T, size_t N> 135 [[gnu::noinline]] static LIBC_INLINE uint64_t throughput( 136 F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) { 137 asm("" ::"v"(&inputs1), "v"(&inputs2)); 138 139 gpu::memory_fence(); 140 uint64_t start = gpu::processor_clock(); 141 142 asm("" ::"s"(start)); 143 144 for (size_t i = 0; i < inputs1.size(); i++) { 145 auto result = f(inputs1[i], inputs2[i]); 146 147 asm("" ::"v"(result)); 148 } 149 150 uint64_t stop = gpu::processor_clock(); 151 asm("" ::"s"(stop)); 152 gpu::memory_fence(); 153 154 // Return the time elapsed. 155 return stop - start; 156 } 157 158 } // namespace LIBC_NAMESPACE_DECL 159 160 #endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU 161