1 //===--- Kernel.cpp - OpenMP device kernel interface -------------- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains the kernel entry points for the device. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "Shared/Environment.h" 14 15 #include "Allocator.h" 16 #include "Debug.h" 17 #include "DeviceTypes.h" 18 #include "Interface.h" 19 #include "Mapping.h" 20 #include "State.h" 21 #include "Synchronization.h" 22 #include "Workshare.h" 23 24 #include "llvm/Frontend/OpenMP/OMPDeviceConstants.h" 25 26 using namespace ompx; 27 28 #pragma omp begin declare target device_type(nohost) 29 30 static void 31 inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, 32 KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { 33 // Order is important here. 34 synchronize::init(IsSPMD); 35 mapping::init(IsSPMD); 36 state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment); 37 allocator::init(IsSPMD, KernelEnvironment); 38 workshare::init(IsSPMD); 39 } 40 41 /// Simple generic state machine for worker threads. 42 static void genericStateMachine(IdentTy *Ident) { 43 uint32_t TId = mapping::getThreadIdInBlock(); 44 45 do { 46 ParallelRegionFnTy WorkFn = nullptr; 47 48 // Wait for the signal that we have a new work function. 49 synchronize::threads(atomic::seq_cst); 50 51 // Retrieve the work function from the runtime. 52 bool IsActive = __kmpc_kernel_parallel(&WorkFn); 53 54 // If there is nothing more to do, break out of the state machine by 55 // returning to the caller. 56 if (!WorkFn) 57 return; 58 59 if (IsActive) { 60 ASSERT(!mapping::isSPMDMode(), nullptr); 61 ((void (*)(uint32_t, uint32_t))WorkFn)(0, TId); 62 __kmpc_kernel_end_parallel(); 63 } 64 65 synchronize::threads(atomic::seq_cst); 66 67 } while (true); 68 } 69 70 extern "C" { 71 72 /// Initialization 73 /// 74 /// \param Ident Source location identification, can be NULL. 75 /// 76 int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, 77 KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { 78 ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration; 79 bool IsSPMD = Configuration.ExecMode & 80 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD; 81 bool UseGenericStateMachine = Configuration.UseGenericStateMachine; 82 if (IsSPMD) { 83 inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment, 84 KernelLaunchEnvironment); 85 synchronize::threadsAligned(atomic::relaxed); 86 } else { 87 inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment, 88 KernelLaunchEnvironment); 89 // No need to wait since only the main threads will execute user 90 // code and workers will run into a barrier right away. 91 } 92 93 if (IsSPMD) { 94 state::assumeInitialState(IsSPMD); 95 96 // Synchronize to ensure the assertions above are in an aligned region. 97 // The barrier is eliminated later. 98 synchronize::threadsAligned(atomic::relaxed); 99 return -1; 100 } 101 102 if (mapping::isInitialThreadInLevel0(IsSPMD)) 103 return -1; 104 105 // Enter the generic state machine if enabled and if this thread can possibly 106 // be an active worker thread. 107 // 108 // The latter check is important for NVIDIA Pascal (but not Volta) and AMD 109 // GPU. In those cases, a single thread can apparently satisfy a barrier on 110 // behalf of all threads in the same warp. Thus, it would not be safe for 111 // other threads in the main thread's warp to reach the first 112 // synchronize::threads call in genericStateMachine before the main thread 113 // reaches its corresponding synchronize::threads call: that would permit all 114 // active worker threads to proceed before the main thread has actually set 115 // state::ParallelRegionFn, and then they would immediately quit without 116 // doing any work. mapping::getMaxTeamThreads() does not include any of the 117 // main thread's warp, so none of its threads can ever be active worker 118 // threads. 119 if (UseGenericStateMachine && 120 mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD)) 121 genericStateMachine(KernelEnvironment.Ident); 122 123 return mapping::getThreadIdInBlock(); 124 } 125 126 /// De-Initialization 127 /// 128 /// In non-SPMD, this function releases the workers trapped in a state machine 129 /// and also any memory dynamically allocated by the runtime. 130 /// 131 /// \param Ident Source location identification, can be NULL. 132 /// 133 void __kmpc_target_deinit() { 134 bool IsSPMD = mapping::isSPMDMode(); 135 if (IsSPMD) 136 return; 137 138 if (mapping::isInitialThreadInLevel0(IsSPMD)) { 139 // Signal the workers to exit the state machine and exit the kernel. 140 state::ParallelRegionFn = nullptr; 141 } else if (!state::getKernelEnvironment() 142 .Configuration.UseGenericStateMachine) { 143 // Retrieve the work function just to ensure we always call 144 // __kmpc_kernel_parallel even if a custom state machine is used. 145 // TODO: this is not super pretty. The problem is we create the call to 146 // __kmpc_kernel_parallel in the openmp-opt pass but while we optimize it 147 // is not there yet. Thus, we assume we never reach it from 148 // __kmpc_target_deinit. That allows us to remove the store in there to 149 // ParallelRegionFn, which leads to bad results later on. 150 ParallelRegionFnTy WorkFn = nullptr; 151 __kmpc_kernel_parallel(&WorkFn); 152 ASSERT(WorkFn == nullptr, nullptr); 153 } 154 } 155 156 int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); } 157 } 158 159 #pragma omp end declare target 160