1 //===------------- NVPTX implementation of timing utils ---------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX 10 #define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX 11 12 #include "src/__support/CPP/array.h" 13 #include "src/__support/CPP/type_traits.h" 14 #include "src/__support/GPU/utils.h" 15 #include "src/__support/common.h" 16 #include "src/__support/macros/attributes.h" 17 #include "src/__support/macros/config.h" 18 19 #include <stdint.h> 20 21 namespace LIBC_NAMESPACE_DECL { 22 23 // Returns the overhead associated with calling the profiling region. This 24 // allows us to substract the constant-time overhead from the latency to 25 // obtain a true result. This can vary with system load. 26 [[gnu::noinline]] static uint64_t overhead() { 27 volatile uint32_t x = 1; 28 uint32_t y = x; 29 uint64_t start = gpu::processor_clock(); 30 asm("" ::"llr"(start)); 31 uint32_t result = y; 32 asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result)); 33 uint64_t stop = gpu::processor_clock(); 34 volatile auto storage = result; 35 return stop - start; 36 } 37 38 // Stimulate a simple function and obtain its latency in clock cycles on the 39 // system. This function cannot be inlined or else it will disturb the very 40 // delicate balance of hard-coded dependencies. 41 template <typename F, typename T> 42 [[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) { 43 // We need to store the input somewhere to guarantee that the compiler will 44 // not constant propagate it and remove the profiling region. 45 volatile T storage = t; 46 T arg = storage; 47 48 // Get the current timestamp from the clock. 49 gpu::memory_fence(); 50 uint64_t start = gpu::processor_clock(); 51 52 // This forces the compiler to load the input argument and run the clock cycle 53 // counter before the profiling region. 54 asm("" ::"llr"(start)); 55 56 // Run the function under test and return its value. 57 auto result = f(arg); 58 59 // This inline assembly performs a no-op which forces the result to both be 60 // used and prevents us from exiting this region before it's complete. 61 asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result)); 62 63 // Obtain the current timestamp after running the calculation and force 64 // ordering. 65 uint64_t stop = gpu::processor_clock(); 66 gpu::memory_fence(); 67 asm("" ::"r"(stop)); 68 volatile T output = result; 69 70 // Return the time elapsed. 71 return stop - start; 72 } 73 74 template <typename F, typename T1, typename T2> 75 static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) { 76 volatile T1 storage = t1; 77 volatile T2 storage2 = t2; 78 T1 arg = storage; 79 T2 arg2 = storage2; 80 81 gpu::memory_fence(); 82 uint64_t start = gpu::processor_clock(); 83 84 asm("" ::"llr"(start)); 85 86 auto result = f(arg, arg2); 87 88 asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result)); 89 90 uint64_t stop = gpu::processor_clock(); 91 gpu::memory_fence(); 92 asm("" ::"r"(stop)); 93 volatile auto output = result; 94 95 return stop - start; 96 } 97 98 // Provides throughput benchmarking. 99 template <typename F, typename T, size_t N> 100 [[gnu::noinline]] static LIBC_INLINE uint64_t 101 throughput(F f, const cpp::array<T, N> &inputs) { 102 asm("" ::"r"(&inputs)); 103 104 gpu::memory_fence(); 105 uint64_t start = gpu::processor_clock(); 106 107 asm("" ::"llr"(start)); 108 109 uint64_t result; 110 for (auto input : inputs) { 111 asm("" ::"r"(input)); 112 result = f(input); 113 asm("" ::"r"(result)); 114 } 115 116 uint64_t stop = gpu::processor_clock(); 117 gpu::memory_fence(); 118 asm("" ::"r"(stop)); 119 volatile auto output = result; 120 121 // Return the time elapsed. 122 return stop - start; 123 } 124 125 // Provides throughput benchmarking for 2 arguments (e.g. atan2()) 126 template <typename F, typename T, size_t N> 127 [[gnu::noinline]] static LIBC_INLINE uint64_t throughput( 128 F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) { 129 asm("" ::"r"(&inputs1), "r"(&inputs2)); 130 131 gpu::memory_fence(); 132 uint64_t start = gpu::processor_clock(); 133 134 asm("" ::"llr"(start)); 135 136 uint64_t result; 137 for (size_t i = 0; i < inputs1.size(); i++) { 138 result = f(inputs1[i], inputs2[i]); 139 asm("" ::"r"(result)); 140 } 141 142 uint64_t stop = gpu::processor_clock(); 143 gpu::memory_fence(); 144 asm("" ::"r"(stop)); 145 volatile auto output = result; 146 147 // Return the time elapsed. 148 return stop - start; 149 } 150 } // namespace LIBC_NAMESPACE_DECL 151 152 #endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX 153