1 //===-- runtime/CUDA/kernel.cpp -------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "flang/Runtime/CUDA/kernel.h" 10 #include "../terminator.h" 11 #include "flang/Runtime/CUDA/common.h" 12 13 #include "cuda_runtime.h" 14 15 extern "C" { 16 17 void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY, 18 intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ, 19 int32_t smem, void **params, void **extra) { 20 dim3 gridDim; 21 gridDim.x = gridX; 22 gridDim.y = gridY; 23 gridDim.z = gridZ; 24 dim3 blockDim; 25 blockDim.x = blockX; 26 blockDim.y = blockY; 27 blockDim.z = blockZ; 28 unsigned nbNegGridDim{0}; 29 if (gridX < 0) { 30 ++nbNegGridDim; 31 } 32 if (gridY < 0) { 33 ++nbNegGridDim; 34 } 35 if (gridZ < 0) { 36 ++nbNegGridDim; 37 } 38 if (nbNegGridDim == 1) { 39 int maxBlocks, nbBlocks, dev, multiProcCount; 40 cudaError_t err1, err2; 41 nbBlocks = blockDim.x * blockDim.y * blockDim.z; 42 cudaGetDevice(&dev); 43 err1 = cudaDeviceGetAttribute( 44 &multiProcCount, cudaDevAttrMultiProcessorCount, dev); 45 err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor( 46 &maxBlocks, kernel, nbBlocks, smem); 47 if (err1 == cudaSuccess && err2 == cudaSuccess) { 48 maxBlocks = multiProcCount * maxBlocks; 49 } 50 if (maxBlocks > 0) { 51 if (gridX > 0) { 52 maxBlocks = maxBlocks / gridDim.x; 53 } 54 if (gridY > 0) { 55 maxBlocks = maxBlocks / gridDim.y; 56 } 57 if (gridZ > 0) { 58 maxBlocks = maxBlocks / gridDim.z; 59 } 60 if (maxBlocks < 1) { 61 maxBlocks = 1; 62 } 63 if (gridX < 0) { 64 gridDim.x = maxBlocks; 65 } 66 if (gridY < 0) { 67 gridDim.y = maxBlocks; 68 } 69 if (gridZ < 0) { 70 gridDim.z = maxBlocks; 71 } 72 } 73 } else if (nbNegGridDim > 1) { 74 Fortran::runtime::Terminator terminator{__FILE__, __LINE__}; 75 terminator.Crash("Too many invalid grid dimensions"); 76 } 77 cudaStream_t stream = 0; // TODO stream managment 78 CUDA_REPORT_IF_ERROR( 79 cudaLaunchKernel(kernel, gridDim, blockDim, params, smem, stream)); 80 } 81 82 void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX, 83 intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY, 84 intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ, 85 int32_t smem, void **params, void **extra) { 86 cudaLaunchConfig_t config; 87 config.gridDim.x = gridX; 88 config.gridDim.y = gridY; 89 config.gridDim.z = gridZ; 90 config.blockDim.x = blockX; 91 config.blockDim.y = blockY; 92 config.blockDim.z = blockZ; 93 unsigned nbNegGridDim{0}; 94 if (gridX < 0) { 95 ++nbNegGridDim; 96 } 97 if (gridY < 0) { 98 ++nbNegGridDim; 99 } 100 if (gridZ < 0) { 101 ++nbNegGridDim; 102 } 103 if (nbNegGridDim == 1) { 104 int maxBlocks, nbBlocks, dev, multiProcCount; 105 cudaError_t err1, err2; 106 nbBlocks = config.blockDim.x * config.blockDim.y * config.blockDim.z; 107 cudaGetDevice(&dev); 108 err1 = cudaDeviceGetAttribute( 109 &multiProcCount, cudaDevAttrMultiProcessorCount, dev); 110 err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor( 111 &maxBlocks, kernel, nbBlocks, smem); 112 if (err1 == cudaSuccess && err2 == cudaSuccess) { 113 maxBlocks = multiProcCount * maxBlocks; 114 } 115 if (maxBlocks > 0) { 116 if (gridX > 0) { 117 maxBlocks = maxBlocks / config.gridDim.x; 118 } 119 if (gridY > 0) { 120 maxBlocks = maxBlocks / config.gridDim.y; 121 } 122 if (gridZ > 0) { 123 maxBlocks = maxBlocks / config.gridDim.z; 124 } 125 if (maxBlocks < 1) { 126 maxBlocks = 1; 127 } 128 if (gridX < 0) { 129 config.gridDim.x = maxBlocks; 130 } 131 if (gridY < 0) { 132 config.gridDim.y = maxBlocks; 133 } 134 if (gridZ < 0) { 135 config.gridDim.z = maxBlocks; 136 } 137 } 138 } else if (nbNegGridDim > 1) { 139 Fortran::runtime::Terminator terminator{__FILE__, __LINE__}; 140 terminator.Crash("Too many invalid grid dimensions"); 141 } 142 config.dynamicSmemBytes = smem; 143 config.stream = 0; // TODO stream managment 144 cudaLaunchAttribute launchAttr[1]; 145 launchAttr[0].id = cudaLaunchAttributeClusterDimension; 146 launchAttr[0].val.clusterDim.x = clusterX; 147 launchAttr[0].val.clusterDim.y = clusterY; 148 launchAttr[0].val.clusterDim.z = clusterZ; 149 config.numAttrs = 1; 150 config.attrs = launchAttr; 151 CUDA_REPORT_IF_ERROR(cudaLaunchKernelExC(&config, kernel, params)); 152 } 153 154 void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX, 155 intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY, 156 intptr_t blockZ, int32_t smem, void **params, void **extra) { 157 dim3 gridDim; 158 gridDim.x = gridX; 159 gridDim.y = gridY; 160 gridDim.z = gridZ; 161 dim3 blockDim; 162 blockDim.x = blockX; 163 blockDim.y = blockY; 164 blockDim.z = blockZ; 165 unsigned nbNegGridDim{0}; 166 if (gridX < 0) { 167 ++nbNegGridDim; 168 } 169 if (gridY < 0) { 170 ++nbNegGridDim; 171 } 172 if (gridZ < 0) { 173 ++nbNegGridDim; 174 } 175 if (nbNegGridDim == 1) { 176 int maxBlocks, nbBlocks, dev, multiProcCount; 177 cudaError_t err1, err2; 178 nbBlocks = blockDim.x * blockDim.y * blockDim.z; 179 cudaGetDevice(&dev); 180 err1 = cudaDeviceGetAttribute( 181 &multiProcCount, cudaDevAttrMultiProcessorCount, dev); 182 err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor( 183 &maxBlocks, kernel, nbBlocks, smem); 184 if (err1 == cudaSuccess && err2 == cudaSuccess) { 185 maxBlocks = multiProcCount * maxBlocks; 186 } 187 if (maxBlocks > 0) { 188 if (gridX > 0) { 189 maxBlocks = maxBlocks / gridDim.x; 190 } 191 if (gridY > 0) { 192 maxBlocks = maxBlocks / gridDim.y; 193 } 194 if (gridZ > 0) { 195 maxBlocks = maxBlocks / gridDim.z; 196 } 197 if (maxBlocks < 1) { 198 maxBlocks = 1; 199 } 200 if (gridX < 0) { 201 gridDim.x = maxBlocks; 202 } 203 if (gridY < 0) { 204 gridDim.y = maxBlocks; 205 } 206 if (gridZ < 0) { 207 gridDim.z = maxBlocks; 208 } 209 } 210 } else if (nbNegGridDim > 1) { 211 Fortran::runtime::Terminator terminator{__FILE__, __LINE__}; 212 terminator.Crash("Too many invalid grid dimensions"); 213 } 214 cudaStream_t stream = 0; // TODO stream managment 215 CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel( 216 kernel, gridDim, blockDim, params, smem, stream)); 217 } 218 219 } // extern "C" 220