xref: /llvm-project/flang/runtime/CUDA/kernel.cpp (revision 48657bf29b01e95749b5ecd8c7f675c14a7948d1)
1 //===-- runtime/CUDA/kernel.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "flang/Runtime/CUDA/kernel.h"
10 #include "../terminator.h"
11 #include "flang/Runtime/CUDA/common.h"
12 
13 #include "cuda_runtime.h"
14 
15 extern "C" {
16 
17 void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
18     intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
19     int32_t smem, void **params, void **extra) {
20   dim3 gridDim;
21   gridDim.x = gridX;
22   gridDim.y = gridY;
23   gridDim.z = gridZ;
24   dim3 blockDim;
25   blockDim.x = blockX;
26   blockDim.y = blockY;
27   blockDim.z = blockZ;
28   unsigned nbNegGridDim{0};
29   if (gridX < 0) {
30     ++nbNegGridDim;
31   }
32   if (gridY < 0) {
33     ++nbNegGridDim;
34   }
35   if (gridZ < 0) {
36     ++nbNegGridDim;
37   }
38   if (nbNegGridDim == 1) {
39     int maxBlocks, nbBlocks, dev, multiProcCount;
40     cudaError_t err1, err2;
41     nbBlocks = blockDim.x * blockDim.y * blockDim.z;
42     cudaGetDevice(&dev);
43     err1 = cudaDeviceGetAttribute(
44         &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
45     err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
46         &maxBlocks, kernel, nbBlocks, smem);
47     if (err1 == cudaSuccess && err2 == cudaSuccess) {
48       maxBlocks = multiProcCount * maxBlocks;
49     }
50     if (maxBlocks > 0) {
51       if (gridX > 0) {
52         maxBlocks = maxBlocks / gridDim.x;
53       }
54       if (gridY > 0) {
55         maxBlocks = maxBlocks / gridDim.y;
56       }
57       if (gridZ > 0) {
58         maxBlocks = maxBlocks / gridDim.z;
59       }
60       if (maxBlocks < 1) {
61         maxBlocks = 1;
62       }
63       if (gridX < 0) {
64         gridDim.x = maxBlocks;
65       }
66       if (gridY < 0) {
67         gridDim.y = maxBlocks;
68       }
69       if (gridZ < 0) {
70         gridDim.z = maxBlocks;
71       }
72     }
73   } else if (nbNegGridDim > 1) {
74     Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
75     terminator.Crash("Too many invalid grid dimensions");
76   }
77   cudaStream_t stream = 0; // TODO stream managment
78   CUDA_REPORT_IF_ERROR(
79       cudaLaunchKernel(kernel, gridDim, blockDim, params, smem, stream));
80 }
81 
82 void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
83     intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY,
84     intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
85     int32_t smem, void **params, void **extra) {
86   cudaLaunchConfig_t config;
87   config.gridDim.x = gridX;
88   config.gridDim.y = gridY;
89   config.gridDim.z = gridZ;
90   config.blockDim.x = blockX;
91   config.blockDim.y = blockY;
92   config.blockDim.z = blockZ;
93   unsigned nbNegGridDim{0};
94   if (gridX < 0) {
95     ++nbNegGridDim;
96   }
97   if (gridY < 0) {
98     ++nbNegGridDim;
99   }
100   if (gridZ < 0) {
101     ++nbNegGridDim;
102   }
103   if (nbNegGridDim == 1) {
104     int maxBlocks, nbBlocks, dev, multiProcCount;
105     cudaError_t err1, err2;
106     nbBlocks = config.blockDim.x * config.blockDim.y * config.blockDim.z;
107     cudaGetDevice(&dev);
108     err1 = cudaDeviceGetAttribute(
109         &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
110     err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
111         &maxBlocks, kernel, nbBlocks, smem);
112     if (err1 == cudaSuccess && err2 == cudaSuccess) {
113       maxBlocks = multiProcCount * maxBlocks;
114     }
115     if (maxBlocks > 0) {
116       if (gridX > 0) {
117         maxBlocks = maxBlocks / config.gridDim.x;
118       }
119       if (gridY > 0) {
120         maxBlocks = maxBlocks / config.gridDim.y;
121       }
122       if (gridZ > 0) {
123         maxBlocks = maxBlocks / config.gridDim.z;
124       }
125       if (maxBlocks < 1) {
126         maxBlocks = 1;
127       }
128       if (gridX < 0) {
129         config.gridDim.x = maxBlocks;
130       }
131       if (gridY < 0) {
132         config.gridDim.y = maxBlocks;
133       }
134       if (gridZ < 0) {
135         config.gridDim.z = maxBlocks;
136       }
137     }
138   } else if (nbNegGridDim > 1) {
139     Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
140     terminator.Crash("Too many invalid grid dimensions");
141   }
142   config.dynamicSmemBytes = smem;
143   config.stream = 0; // TODO stream managment
144   cudaLaunchAttribute launchAttr[1];
145   launchAttr[0].id = cudaLaunchAttributeClusterDimension;
146   launchAttr[0].val.clusterDim.x = clusterX;
147   launchAttr[0].val.clusterDim.y = clusterY;
148   launchAttr[0].val.clusterDim.z = clusterZ;
149   config.numAttrs = 1;
150   config.attrs = launchAttr;
151   CUDA_REPORT_IF_ERROR(cudaLaunchKernelExC(&config, kernel, params));
152 }
153 
154 void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
155     intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
156     intptr_t blockZ, int32_t smem, void **params, void **extra) {
157   dim3 gridDim;
158   gridDim.x = gridX;
159   gridDim.y = gridY;
160   gridDim.z = gridZ;
161   dim3 blockDim;
162   blockDim.x = blockX;
163   blockDim.y = blockY;
164   blockDim.z = blockZ;
165   unsigned nbNegGridDim{0};
166   if (gridX < 0) {
167     ++nbNegGridDim;
168   }
169   if (gridY < 0) {
170     ++nbNegGridDim;
171   }
172   if (gridZ < 0) {
173     ++nbNegGridDim;
174   }
175   if (nbNegGridDim == 1) {
176     int maxBlocks, nbBlocks, dev, multiProcCount;
177     cudaError_t err1, err2;
178     nbBlocks = blockDim.x * blockDim.y * blockDim.z;
179     cudaGetDevice(&dev);
180     err1 = cudaDeviceGetAttribute(
181         &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
182     err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
183         &maxBlocks, kernel, nbBlocks, smem);
184     if (err1 == cudaSuccess && err2 == cudaSuccess) {
185       maxBlocks = multiProcCount * maxBlocks;
186     }
187     if (maxBlocks > 0) {
188       if (gridX > 0) {
189         maxBlocks = maxBlocks / gridDim.x;
190       }
191       if (gridY > 0) {
192         maxBlocks = maxBlocks / gridDim.y;
193       }
194       if (gridZ > 0) {
195         maxBlocks = maxBlocks / gridDim.z;
196       }
197       if (maxBlocks < 1) {
198         maxBlocks = 1;
199       }
200       if (gridX < 0) {
201         gridDim.x = maxBlocks;
202       }
203       if (gridY < 0) {
204         gridDim.y = maxBlocks;
205       }
206       if (gridZ < 0) {
207         gridDim.z = maxBlocks;
208       }
209     }
210   } else if (nbNegGridDim > 1) {
211     Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
212     terminator.Crash("Too many invalid grid dimensions");
213   }
214   cudaStream_t stream = 0; // TODO stream managment
215   CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel(
216       kernel, gridDim, blockDim, params, smem, stream));
217 }
218 
219 } // extern "C"
220