xref: /llvm-project/mlir/test/Dialect/GPU/outlining.mlir (revision 516d6ede122086027baa2288623605a423375e87)
1// RUN: mlir-opt -allow-unregistered-dialect -gpu-launch-sink-index-computations -gpu-kernel-outlining -split-input-file -verify-diagnostics %s | FileCheck %s
2// RUN: mlir-opt -allow-unregistered-dialect -gpu-launch-sink-index-computations -gpu-kernel-outlining=data-layout-str='#dlti.dl_spec<#dlti.dl_entry<index,32:i32>>' -split-input-file %s | FileCheck --check-prefix CHECK-DL %s
3
4// CHECK: module attributes {gpu.container_module}
5
6// CHECK-LABEL: func @launch()
// Test input: a gpu.launch whose six grid/block sizes are arith.constant
// values and whose body reads %0 and %1, both defined outside the region.
// The directives expect the launch to be replaced by a gpu.launch_func that
// passes %0 and %1 as kernel arguments.
7func.func @launch() {
8  // CHECK: %[[ARG0:.*]] = "op"() : () -> f32
9  %0 = "op"() : () -> (f32)
10  // CHECK: %[[ARG1:.*]] = "op"() : () -> memref<?xf32, 1>
11  %1 = "op"() : () -> (memref<?xf32, 1>)
12  // CHECK: %[[GDIMX:.*]] = arith.constant 8
13  %gDimX = arith.constant 8 : index
14  // CHECK: %[[GDIMY:.*]] = arith.constant 12
15  %gDimY = arith.constant 12 : index
16  // CHECK: %[[GDIMZ:.*]] = arith.constant 16
17  %gDimZ = arith.constant 16 : index
18  // CHECK: %[[BDIMX:.*]] = arith.constant 20
19  %bDimX = arith.constant 20 : index
20  // CHECK: %[[BDIMY:.*]] = arith.constant 24
21  %bDimY = arith.constant 24 : index
22  // CHECK: %[[BDIMZ:.*]] = arith.constant 28
23  %bDimZ = arith.constant 28 : index
24
25  // CHECK: gpu.launch_func @launch_kernel::@launch_kernel blocks in (%[[GDIMX]], %[[GDIMY]], %[[GDIMZ]]) threads in (%[[BDIMX]], %[[BDIMY]], %[[BDIMZ]]) args(%[[ARG0]] : f32, %[[ARG1]] : memref<?xf32, 1>)
26  // CHECK-NOT: gpu.launch blocks
27  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY,
28                                       %grid_z = %gDimZ)
29             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY,
30                                        %block_z = %bDimZ) {
    // The body uses one outside f32, the x block id, the x block size, and
    // loads from the outside memref at the x thread id.
31    "use"(%0): (f32) -> ()
32    "some_op"(%bx, %block_x) : (index, index) -> ()
33    %42 = memref.load %1[%tx] : memref<?xf32, 1>
34    gpu.terminator
35  }
36  return
37}
38
39// CHECK-DL-LABEL: gpu.module @launch_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}
40// CHECK-LABEL: gpu.module @launch_kernel
41// CHECK-NEXT: gpu.func @launch_kernel
42// CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref<?xf32, 1>)
43// CHECK-SAME: known_block_size = array<i32: 20, 24, 28>
44// CHECK-SAME: known_grid_size = array<i32: 8, 12, 16>
45// CHECK-NEXT: %[[BID:.*]] = gpu.block_id x
46// CHECK-NEXT: = gpu.block_id y
47// CHECK-NEXT: = gpu.block_id z
48// CHECK-NEXT: %[[TID:.*]] = gpu.thread_id x
49// CHECK-NEXT: = gpu.thread_id y
50// CHECK-NEXT: = gpu.thread_id z
51// CHECK-NEXT: = gpu.grid_dim x
52// CHECK-NEXT: = gpu.grid_dim y
53// CHECK-NEXT: = gpu.grid_dim z
54// CHECK-NEXT: %[[BDIM:.*]] = gpu.block_dim x
55// CHECK-NEXT: = gpu.block_dim y
56// CHECK-NEXT: = gpu.block_dim z
57// CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
58// CHECK-NEXT: "some_op"(%[[BID]], %[[BDIM]]) : (index, index) -> ()
59// CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
60
61// -----
62
63// Verify that we can outline a CFG
64// CHECK-LABEL:  gpu.func @launchCFG_kernel(
65// CHECK: cf.br
66// CHECK: gpu.return
// Test input: same operands as @launch, but the launch region consists of two
// basic blocks connected by an unconditional cf.br.
67func.func @launchCFG() {
68  %0 = "op"() : () -> (f32)
69  %1 = "op"() : () -> (memref<?xf32, 1>)
70  %gDimX = arith.constant 8 : index
71  %gDimY = arith.constant 12 : index
72  %gDimZ = arith.constant 16 : index
73  %bDimX = arith.constant 20 : index
74  %bDimY = arith.constant 24 : index
75  %bDimZ = arith.constant 28 : index
76
77  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY,
78                                       %grid_z = %gDimZ)
79             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY,
80                                        %block_z = %bDimZ) {
81    "use"(%0): (f32) -> ()
    // Branch to a second block so the region is a multi-block CFG.
82    cf.br ^bb1
83  ^bb1:
84    "some_op"(%bx, %block_x) : (index, index) -> ()
85    %42 = memref.load %1[%tx] : memref<?xf32, 1>
86    gpu.terminator
87  }
88  return
89}
90
91
92// -----
93
94// This test checks that gpu-kernel-outlining can handle a gpu.launch nested in an llvm.func
95// CHECK-LABEL: @launch_from_llvm_func
// Test input: the enclosing function is an llvm.func (terminated by
// llvm.return) rather than a func.func. A single constant %dim is used for all
// six launch dimensions.
96llvm.func @launch_from_llvm_func() {
97  // CHECK: %[[ARG0:.*]] = "op"() : () -> f32
98  %0 = "op"() : () -> (f32)
99  // CHECK: %[[ARG1:.*]] = "op"() : () -> memref<?xf32, 1>
100  %1 = "op"() : () -> (memref<?xf32, 1>)
101
102  // CHECK: %[[DIM:.*]] = arith.constant 1
103  %dim = arith.constant 1 : index
104
105  // CHECK: gpu.launch_func @launch_from_llvm_func_kernel::@launch_from_llvm_func_kernel
106  // CHECK-SAME: (%[[DIM]], %[[DIM]], %[[DIM]])
107  // CHECK-SAME: (%[[DIM]], %[[DIM]], %[[DIM]]) args(%[[ARG0]] : f32, %[[ARG1]] : memref<?xf32, 1>)
108  // CHECK-NEXT: llvm.return
109
110  // CHECK: gpu.func {{.*}} kernel attributes
111  // CHECK-SAME: known_block_size = array<i32: 1, 1, 1>
112  // CHECK-SAME: known_grid_size = array<i32: 1, 1, 1>
113  // CHECK: gpu.return
114  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %dim, %grid_y = %dim,
115                                       %grid_z = %dim)
116             threads(%tx, %ty, %tz) in (%block_x = %dim, %block_y = %dim,
117                                        %block_z = %dim) {
118    "use"(%0): (f32) -> ()
119    "some_op"(%bx, %block_x) : (index, index) -> ()
120    %2 = memref.load %1[%tx] : memref<?xf32, 1>
121    gpu.terminator
122  }
123  llvm.return
124}
125
126// CHECK-DL-LABEL: gpu.module @launch_from_llvm_func_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}
127
128// -----
129
130// CHECK: module attributes {gpu.container_module}
131// CHECK-LABEL: @multiple_launches
// Test input: four gpu.launch ops with empty bodies in one function. The
// first two are synchronous, the third is async with a dependency token, and
// the fourth is async without dependencies. The directives expect the module
// names to get _0, _1, _2 suffixes while the kernel function name stays the
// same.
132func.func @multiple_launches() {
133  // CHECK: %[[CST:.*]] = arith.constant 8 : index
134  %cst = arith.constant 8 : index
135  // CHECK: gpu.launch_func @multiple_launches_kernel::@multiple_launches_kernel blocks in (%[[CST]], %[[CST]], %[[CST]]) threads in (%[[CST]], %[[CST]], %[[CST]])
136  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
137                                       %grid_z = %cst)
138             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
139                                        %block_z = %cst) {
140    gpu.terminator
141  }
142  // CHECK: gpu.launch_func @multiple_launches_kernel_0::@multiple_launches_kernel blocks in (%[[CST]], %[[CST]], %[[CST]]) threads in (%[[CST]], %[[CST]], %[[CST]])
143  gpu.launch blocks(%bx2, %by2, %bz2) in (%grid_x2 = %cst, %grid_y2 = %cst,
144                                          %grid_z2 = %cst)
145             threads(%tx2, %ty2, %tz2) in (%block_x2 = %cst, %block_y2 = %cst,
146                                           %block_z2 = %cst) {
147    gpu.terminator
148  }
149
150  // With async and async deps.
151  // CHECK: %[[TOKEN:.*]] = gpu.wait async
152  // CHECK: gpu.launch_func async [%[[TOKEN]]] @multiple_launches_kernel_1::@multiple_launches_kernel blocks in (%[[CST]], %[[CST]], %[[CST]]) threads in (%[[CST]], %[[CST]], %[[CST]])
153  %t = gpu.wait async
154  %u = gpu.launch async [%t] blocks(%bx2, %by2, %bz2) in (%grid_x2 = %cst, %grid_y2 = %cst,
155                                          %grid_z2 = %cst)
156             threads(%tx2, %ty2, %tz2) in (%block_x2 = %cst, %block_y2 = %cst,
157                                           %block_z2 = %cst) {
158    gpu.terminator
159  }
160
  // Async launch with an empty dependency list.
161  // CHECK: gpu.launch_func async @multiple_launches_kernel_2::@multiple_launches_kernel blocks in (%[[CST]], %[[CST]], %[[CST]]) threads in (%[[CST]], %[[CST]], %[[CST]])
162  %v = gpu.launch async blocks(%bx2, %by2, %bz2) in (%grid_x2 = %cst, %grid_y2 = %cst,
163                                     %grid_z2 = %cst)
164             threads(%tx2, %ty2, %tz2) in (%block_x2 = %cst, %block_y2 = %cst,
165                                           %block_z2 = %cst) {
166    gpu.terminator
167  }
168
169  return
170}
171
172// CHECK-DL-LABEL: gpu.module @multiple_launches_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}
173// CHECK-DL-LABEL: gpu.module @multiple_launches_kernel_0 attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}
174
175// CHECK: gpu.module @multiple_launches_kernel
176// CHECK: func @multiple_launches_kernel
177// CHECK: module @multiple_launches_kernel_0
178// CHECK: func @multiple_launches_kernel
179
180// -----
181
182// CHECK-LABEL: @extra_constants_not_inlined
// Test input: the launch body uses %cst2 (an arith.constant), the function
// argument %arg0, and %cst3, which is produced by the unregistered op
// "secret_constant". The directives expect the launch_func to receive a
// memref and an index argument, and the kernel to contain a constant 2.
183func.func @extra_constants_not_inlined(%arg0: memref<?xf32>) {
184  // CHECK: %[[CST:.*]] = arith.constant 8 : index
185  %cst = arith.constant 8 : index
186  %cst2 = arith.constant 2 : index
187  %c0 = arith.constant 0 : index
188  %cst3 = "secret_constant"() : () -> index
189  // CHECK: gpu.launch_func @extra_constants_not_inlined_kernel::@extra_constants_not_inlined_kernel blocks in (%[[CST]], %[[CST]], %[[CST]]) threads in (%[[CST]], %[[CST]], %[[CST]]) args({{.*}} : memref<?xf32>, {{.*}} : index)
190  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
191                                       %grid_z = %cst)
192             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
193                                        %block_z = %cst) {
194    "use"(%cst2, %arg0, %cst3) : (index, memref<?xf32>, index) -> ()
195    gpu.terminator
196  }
197  return
198}
199
200// CHECK-DL-LABEL: gpu.module @extra_constants_not_inlined_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}
201
202// CHECK-LABEL: func @extra_constants_not_inlined_kernel(%{{.*}}: memref<?xf32>, %{{.*}}: index)
203// CHECK: arith.constant 2
204
205// -----
206
207// CHECK-LABEL: @extra_constants
208// CHECK-SAME: %[[ARG0:.*]]: memref<?xf32>
// Test input: %cst3 is a memref.dim of %arg0, and %arg0 is also used directly
// in the launch body. The directives expect the launch_func to take only
// %arg0, with the constants and the memref.dim appearing inside the kernel.
209func.func @extra_constants(%arg0: memref<?xf32>) {
210  // CHECK: %[[CST:.*]] = arith.constant 8 : index
211  %cst = arith.constant 8 : index
212  %cst2 = arith.constant 2 : index
213  %c0 = arith.constant 0 : index
214  %cst3 = memref.dim %arg0, %c0 : memref<?xf32>
215  // CHECK: gpu.launch_func @extra_constants_kernel::@extra_constants_kernel blocks in (%[[CST]], %[[CST]], %[[CST]]) threads in (%[[CST]], %[[CST]], %[[CST]]) args(%[[ARG0]] : memref<?xf32>)
216  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
217                                       %grid_z = %cst)
218             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
219                                        %block_z = %cst) {
220    "use"(%cst2, %arg0, %cst3) : (index, memref<?xf32>, index) -> ()
221    gpu.terminator
222  }
223  return
224}
225
226// CHECK-DL-LABEL: gpu.module @extra_constants_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}
227
228// CHECK-LABEL: func @extra_constants_kernel(
229// CHECK-SAME: %[[KARG0:.*]]: memref<?xf32>
230// CHECK: arith.constant 2
231// CHECK: arith.constant 0
232// CHECK: memref.dim %[[KARG0]]
233
234// -----
235
236// CHECK-LABEL: @extra_constants_noarg
237// CHECK-SAME: %[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: memref<?xf32>
// Test input: %cst3 is a memref.dim of %arg1, and %arg1 is not otherwise used
// in the launch body. The directives expect the memref.dim to remain in the
// host function and its index result to be passed as a kernel argument
// alongside %arg0.
238func.func @extra_constants_noarg(%arg0: memref<?xf32>, %arg1: memref<?xf32>) {
239  // CHECK: %[[CST:.*]] = arith.constant 8 : index
240  %cst = arith.constant 8 : index
241  %cst2 = arith.constant 2 : index
242  %c0 = arith.constant 0 : index
243  // CHECK: memref.dim %[[ARG1]]
244  %cst3 = memref.dim %arg1, %c0 : memref<?xf32>
245  // CHECK: gpu.launch_func @extra_constants_noarg_kernel::@extra_constants_noarg_kernel blocks in (%[[CST]], %[[CST]], %[[CST]]) threads in (%[[CST]], %[[CST]], %[[CST]]) args(%[[ARG0]] : memref<?xf32>, {{.*}} : index)
246  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
247                                       %grid_z = %cst)
248             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
249                                        %block_z = %cst) {
250    "use"(%cst2, %arg0, %cst3) : (index, memref<?xf32>, index) -> ()
251    gpu.terminator
252  }
253  return
254}
255
256// CHECK-DL-LABEL: gpu.module @extra_constants_noarg_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}
257
258// CHECK-LABEL: func @extra_constants_noarg_kernel(
259// CHECK-SAME: %[[KARG0:.*]]: memref<?xf32>, %[[KARG1:.*]]: index
260// CHECK: %[[KCST:.*]] = arith.constant 2
261// CHECK: "use"(%[[KCST]], %[[KARG0]], %[[KARG1]])
262
263// -----
264
265// CHECK-LABEL: @multiple_uses
// Test input: the constant %c2 is used three times across two ops in the
// launch body. The directives expect a single constant 2 in the kernel that
// feeds all three uses.
266func.func @multiple_uses(%arg0 : memref<?xf32>) {
267  %c1 = arith.constant 1 : index
268  %c2 = arith.constant 2 : index
269  // CHECK: gpu.func {{.*}} {
270  // CHECK:   %[[C2:.*]] = arith.constant 2 : index
271  // CHECK:   "use1"(%[[C2]], %[[C2]])
272  // CHECK:   "use2"(%[[C2]])
273  // CHECK:   gpu.return
274  // CHECK: }
275  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1,
276                                       %grid_z = %c1)
277             threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1,
278                                        %block_z = %c1) {
279    "use1"(%c2, %c2) : (index, index) -> ()
280    "use2"(%c2) : (index) -> ()
281    gpu.terminator
282  }
283  return
284}
285
286// CHECK-DL-LABEL: gpu.module @multiple_uses_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}
287
288// -----
289
290// CHECK-LABEL: @multiple_uses2
// Test input: %c2 is used both directly in the launch body and as the index
// operand of the memref.dim producing %d, and %arg0 is used both by that
// memref.dim and directly in the body. The directives expect one constant 2
// and one memref.dim in the kernel, shared by all uses.
291func.func @multiple_uses2(%arg0 : memref<*xf32>) {
292  %c1 = arith.constant 1 : index
293  %c2 = arith.constant 2 : index
294  %d = memref.dim %arg0, %c2 : memref<*xf32>
295  // CHECK: gpu.func {{.*}} {
296  // CHECK:   %[[C2:.*]] = arith.constant 2 : index
297  // CHECK:   %[[D:.*]] = memref.dim %[[ARG:.*]], %[[C2]]
298  // CHECK:   "use1"(%[[D]])
299  // CHECK:   "use2"(%[[C2]], %[[C2]])
300  // CHECK:   "use3"(%[[ARG]])
301  // CHECK:   gpu.return
302  // CHECK: }
303  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1,
304                                       %grid_z = %c1)
305             threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1,
306                                        %block_z = %c1) {
307    "use1"(%d) : (index) -> ()
308    "use2"(%c2, %c2) : (index, index) -> ()
309    "use3"(%arg0) : (memref<*xf32>) -> ()
310    gpu.terminator
311  }
312  return
313}
314
315// CHECK-DL-LABEL: gpu.module @multiple_uses2_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}
316
317// -----
318
// Module-level i64 global; its address is taken inside the launch body of
// @function_call, and the directives expect a copy inside the kernel module.
319llvm.mlir.global internal @global(42 : i64) : i64
320
321// CHECK-LABEL: @function_call
// Test input: the launch body calls @device_function twice and takes the
// address of @global. The directives expect the kernel module to contain the
// kernel, the global, and one copy each of @device_function and
// @recursive_device_function.
322func.func @function_call(%arg0 : memref<?xf32>) {
323  %cst = arith.constant 8 : index
324  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
325                                       %grid_z = %cst)
326             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
327                                        %block_z = %cst) {
328    func.call @device_function() : () -> ()
329    func.call @device_function() : () -> ()
330    %0 = llvm.mlir.addressof @global : !llvm.ptr
331    gpu.terminator
332  }
333  return
334}
335
// Callee of the launch body in @function_call; it in turn calls
// @recursive_device_function.
336func.func @device_function() {
337  call @recursive_device_function() : () -> ()
338  return
339}
340
// Self-recursive callee, reached from the launch body only through
// @device_function.
341func.func @recursive_device_function() {
342  call @recursive_device_function() : () -> ()
343  return
344}
345
346// CHECK-DL-LABEL: gpu.module @function_call_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}
347
348// CHECK: gpu.module @function_call_kernel {
349// CHECK:   gpu.func @function_call_kernel()
350// CHECK:     call @device_function() : () -> ()
351// CHECK:     call @device_function() : () -> ()
352// CHECK:     llvm.mlir.addressof @global : !llvm.ptr
353// CHECK:     gpu.return
354//
355// CHECK:   llvm.mlir.global internal @global(42 : i64) {addr_space = 0 : i32} : i64
356//
357// CHECK:   func @device_function()
358// CHECK:   func @recursive_device_function()
359// CHECK-NOT:   func @device_function
360
361// -----
362
363// CHECK-LABEL: @non_constant_launches
// Test input: all six launch dimensions are the function argument %arg0
// rather than constants. The negative directives below expect no
// known_block_size / known_grid_size text in the output that follows this
// function's label.
364func.func @non_constant_launches(%arg0 : index) {
365  // CHECK-NOT: known_block_size
366  // CHECK-NOT: known_grid_size
367  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %arg0, %grid_y = %arg0,
368                                       %grid_z = %arg0)
369             threads(%tx, %ty, %tz) in (%block_x = %arg0, %block_y = %arg0,
370                                        %block_z = %arg0) {
371    gpu.terminator
372  }
373  return
374}
375
376// CHECK-DL-LABEL: gpu.module @non_constant_launches_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}
377
378// CHECK: module attributes {gpu.container_module}
379
380// -----
381
382// This test checks memory attributions for gpu.launch, using both workgroup and private attributions.
383// CHECK-LABEL: func @launch_memory_attributions_0()
// Test input: a gpu.launch declaring one workgroup attribution (%shared) and
// two private attributions (%priv0, %priv1). The body loads from %shared and
// %priv1; %priv0 is declared but not loaded from.
384func.func @launch_memory_attributions_0() {
385  %1 = "op"() : () -> (memref<?xf32, 1>)
386  %128 = arith.constant 128 : index
387
388  // CHECK: gpu.launch_func @launch_memory_attributions_0_kernel::@launch_memory_attributions_0_kernel
389  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %128, %grid_y = %128,
390                                       %grid_z = %128)
391             threads(%tx, %ty, %tz) in (%block_x = %128, %block_y = %128,
392                                        %block_z = %128)
393             workgroup(%shared: memref<42xf32, 3>)
394             private(%priv0: memref<2xf32, 5>, %priv1: memref<1xf32, 5>) {
395    "some_op"(%bx, %block_x) : (index, index) -> ()
396    %42 = memref.load %1[%tx] : memref<?xf32, 1>
397    %43 = memref.load %shared[%tx] : memref<42xf32, 3>
398    %44 = memref.load %priv1[%tx] : memref<1xf32, 5>
399    gpu.terminator
400  }
401  return
402}
403
404// CHECK-DL-LABEL: gpu.module @launch_memory_attributions_0_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}
405
406// CHECK-LABEL: gpu.module @launch_memory_attributions_0_kernel
407// CHECK-NEXT: gpu.func @launch_memory_attributions_0_kernel
408// CHECK-SAME: workgroup(%[[KERNEL_ARG1:.*]] : memref<42xf32, 3>)
409// CHECK-SAME: private(%[[KERNEL_ARG2:.*]] : memref<2xf32, 5>, %[[KERNEL_ARG3:.*]] : memref<1xf32, 5>)
410// CHECK: %[[TID:.*]] = gpu.thread_id x
411// CHECK: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<42xf32, 3>
412// CHECK-NEXT: = memref.load %[[KERNEL_ARG3]][%[[TID]]] : memref<1xf32, 5>
413
414// -----
415
416// This test checks correctness of private attributions in the absence of workgroup attributions.
417// CHECK-LABEL: @launch_memory_attributions_1
// Test input: a gpu.launch with a single private attribution and no
// workgroup attribution. The body loads from %priv0 at the outside constant
// %c2; %d is computed in the host function but not used by the launch body.
418func.func @launch_memory_attributions_1(%arg0 : memref<*xf32>) {
419  %c1 = arith.constant 1 : index
420  %c2 = arith.constant 2 : index
421  %d = memref.dim %arg0, %c2 : memref<*xf32>
422  // CHECK: gpu.func {{.*}}  private(%[[KERNEL_ARG:.*]] : memref<3xf32, 5>) {{.*}} {
423  // CHECK:   %[[C2:.*]] = arith.constant 2 : index
424  // CHECK: = memref.load %[[KERNEL_ARG]][%[[C2]]] : memref<3xf32, 5>
425  // CHECK:   gpu.return
426  // CHECK: }
427  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1,
428                                       %grid_z = %c1)
429             threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1,
430                                        %block_z = %c1)
431             private(%priv0: memref<3xf32, 5>) {
432    %42 = memref.load %priv0[%c2] : memref<3xf32, 5>
433    gpu.terminator
434  }
435  return
436}
437
438// CHECK-DL-LABEL: gpu.module @launch_memory_attributions_1_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}
439
440// -----
441// CHECK: module attributes {gpu.container_module}
442
443// CHECK-LABEL: func @launch_cluster()
// Test input: a gpu.launch that carries a clusters(...) clause in addition to
// blocks(...) and threads(...), with all nine sizes given as constants. The
// body uses the x cluster id together with the x block id and x block size.
// The directives expect a gpu.launch_func with a matching "clusters in" clause.
444func.func @launch_cluster() {
445  // CHECK: %[[ARG0:.*]] = "op"() : () -> f32
446  %0 = "op"() : () -> (f32)
447  // CHECK: %[[ARG1:.*]] = "op"() : () -> memref<?xf32, 1>
448  %1 = "op"() : () -> (memref<?xf32, 1>)
449  // CHECK: %[[CDIMX:.*]] = arith.constant 1
450  %cDimX = arith.constant 1 : index
451  // CHECK: %[[CDIMY:.*]] = arith.constant 2
452  %cDimY = arith.constant 2 : index
453  // CHECK: %[[CDIMZ:.*]] = arith.constant 1
454  %cDimZ = arith.constant 1 : index
455  // CHECK: %[[GDIMX:.*]] = arith.constant 8
456  %gDimX = arith.constant 8 : index
457  // CHECK: %[[GDIMY:.*]] = arith.constant 12
458  %gDimY = arith.constant 12 : index
459  // CHECK: %[[GDIMZ:.*]] = arith.constant 16
460  %gDimZ = arith.constant 16 : index
461  // CHECK: %[[BDIMX:.*]] = arith.constant 20
462  %bDimX = arith.constant 20 : index
463  // CHECK: %[[BDIMY:.*]] = arith.constant 24
464  %bDimY = arith.constant 24 : index
465  // CHECK: %[[BDIMZ:.*]] = arith.constant 28
466  %bDimZ = arith.constant 28 : index
467
468  // CHECK: gpu.launch_func @launch_cluster_kernel::@launch_cluster_kernel clusters in (%[[CDIMX]], %[[CDIMY]], %[[CDIMZ]]) blocks in (%[[GDIMX]], %[[GDIMY]], %[[GDIMZ]]) threads in (%[[BDIMX]], %[[BDIMY]], %[[BDIMZ]]) args(%[[ARG0]] : f32, %[[ARG1]] : memref<?xf32, 1>)
  // A leftover launch in this test would begin with its clusters clause, so
  // the negative pattern must look for "gpu.launch clusters"; the
  // "gpu.launch blocks" spelling could never match here.
469  // CHECK-NOT: gpu.launch clusters
470  gpu.launch clusters(%cx, %cy, %cz) in (%cluster_x = %cDimX, %cluster_y = %cDimY,
471                                       %cluster_z = %cDimZ)
472             blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY,
473                                       %grid_z = %gDimZ)
474             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY,
475                                        %block_z = %bDimZ) {
476    "use"(%0): (f32) -> ()
477    "some_op"(%cx, %bx, %block_x) : (index, index, index) -> ()
478    %42 = memref.load %1[%tx] : memref<?xf32, 1>
479    gpu.terminator
480  }
481  return
482}
483
484// CHECK-LABEL: gpu.module @launch_cluster_kernel
485// CHECK-NEXT: gpu.func @launch_cluster_kernel
486// CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref<?xf32, 1>)
487// CHECK-SAME: known_block_size = array<i32: 20, 24, 28>
488// CHECK-SAME: known_grid_size = array<i32: 8, 12, 16>
489// CHECK-NEXT: %[[BID:.*]] = gpu.block_id x
490// CHECK-NEXT: = gpu.block_id y
491// CHECK-NEXT: = gpu.block_id z
492// CHECK-NEXT: %[[TID:.*]] = gpu.thread_id x
493// CHECK-NEXT: = gpu.thread_id y
494// CHECK-NEXT: = gpu.thread_id z
495// CHECK-NEXT: = gpu.grid_dim x
496// CHECK-NEXT: = gpu.grid_dim y
497// CHECK-NEXT: = gpu.grid_dim z
498// CHECK-NEXT: %[[BDIM:.*]] = gpu.block_dim x
499// CHECK-NEXT: = gpu.block_dim y
500// CHECK-NEXT: = gpu.block_dim z
501// CHECK-NEXT: %[[CID:.*]] = gpu.cluster_id x
502// CHECK-NEXT: = gpu.cluster_id y
503// CHECK-NEXT: = gpu.cluster_id z
504// CHECK-NEXT: %[[CDIM:.*]] = gpu.cluster_dim x
505// CHECK-NEXT: = gpu.cluster_dim y
506// CHECK-NEXT: = gpu.cluster_dim z
507// CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
508// CHECK-NEXT: "some_op"(%[[CID]], %[[BID]], %[[BDIM]]) : (index, index, index) -> ()
509// CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
510
511// -----
512// This test exercises both optional gpu.launch attributes, kernelModule and kernelFunc, together.
513// CHECK-LABEL: func.func @testKernelAttributes()
514// CHECK: gpu.launch_func  @test_module::@test_kernel_func blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]])
515// CHECK: gpu.module @test_module
516// CHECK: gpu.func @test_kernel_func()
// Test input: the launch carries both kernelModule and kernelFunc attributes.
// The directives expect the outlined module and function to take exactly
// those names.
517func.func @testKernelAttributes() {
518  %gDimX = arith.constant 8 : index
519  %gDimY = arith.constant 12 : index
520  %gDimZ = arith.constant 16 : index
521  %bDimX = arith.constant 32 : index
522  %bDimY = arith.constant 16 : index
523  %bDimZ = arith.constant 8 : index
524
525  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ)
526             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) {
527    "some_op"(%bx, %tx) : (index, index) -> ()
528    gpu.terminator
529  } {kernelModule = @test_module, kernelFunc = @test_kernel_func}
530  return
531}
532
533// -----
534// This test exercises the optional gpu.launch attributes kernelModule and kernelFunc when the named kernelModule already exists.
535
536// CHECK-LABEL: gpu.module @existing_module
537// CHECK: gpu.func @test_kernel_func()
538// CHECK: gpu.func @test_kernel_func_0()
539// CHECK-NOT: gpu.module @testExistingModule_kernel
540// CHECK-NOT: gpu.func @testExistingModule_kernel()
541// CHECK: func.func @testExistingModule()
542// CHECK: gpu.launch_func  @existing_module::@test_kernel_func_0 blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]])
543
// Pre-existing gpu.module that already holds a gpu.func named
// @test_kernel_func, the same module/function names that the launch in
// @testExistingModule requests.
544gpu.module @existing_module {
545  gpu.func @test_kernel_func() {
546    gpu.return
547  }
548}
549
// Test input: the launch requests a kernelModule and kernelFunc that both
// already exist. The directives expect the kernel to be added to
// @existing_module under the name @test_kernel_func_0, and no default-named
// module or function to be created.
550func.func @testExistingModule() {
551  %gDimX = arith.constant 8 : index
552  %gDimY = arith.constant 12 : index
553  %gDimZ = arith.constant 16 : index
554  %bDimX = arith.constant 32 : index
555  %bDimY = arith.constant 16 : index
556  %bDimZ = arith.constant 8 : index
557
558  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ)
559             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) {
560    "some_op"(%bx, %tx) : (index, index) -> ()
561    gpu.terminator
562  } {kernelModule = @existing_module, kernelFunc = @test_kernel_func}
563  return
564}
565
566// -----
567// This test exercises gpu.launch with only the optional kernelModule attribute set.
568// CHECK-LABEL: func.func @testKernelModuleOnly()
569// CHECK: gpu.launch_func  @test_module::@testKernelModuleOnly_kernel blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]])
570// CHECK: gpu.module @test_module
571// CHECK: gpu.func @testKernelModuleOnly_kernel()
// Test input: only kernelModule is set on the launch. The directives expect
// the module to be named @test_module and the kernel function to be named
// @testKernelModuleOnly_kernel.
572func.func @testKernelModuleOnly() {
573  %gDimX = arith.constant 8 : index
574  %gDimY = arith.constant 12 : index
575  %gDimZ = arith.constant 16 : index
576  %bDimX = arith.constant 32 : index
577  %bDimY = arith.constant 16 : index
578  %bDimZ = arith.constant 8 : index
579
580  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ)
581             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) {
582    "some_op"(%bx, %tx) : (index, index) -> ()
583    gpu.terminator
584  } {kernelModule = @test_module}
585  return
586}
587
588// -----
589// This test exercises gpu.launch with only the optional kernelFunc attribute set.
590// CHECK-LABEL: func.func @testKernelFuncOnly()
591// CHECK: gpu.launch_func  @test_kernel_func::@test_kernel_func blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]])
592
593// CHECK: gpu.module @test_kernel_func
594// CHECK: gpu.func @test_kernel_func()
// Test input: only kernelFunc is set on the launch. The directives expect
// both the module and the kernel function to be named @test_kernel_func.
595func.func @testKernelFuncOnly() {
596  %gDimX = arith.constant 8 : index
597  %gDimY = arith.constant 12 : index
598  %gDimZ = arith.constant 16 : index
599  %bDimX = arith.constant 32 : index
600  %bDimY = arith.constant 16 : index
601  %bDimZ = arith.constant 8 : index
602
603  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ)
604             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) {
605    "some_op"(%bx, %tx) : (index, index) -> ()
606    gpu.terminator
607  } {kernelFunc = @test_kernel_func}
608  return
609}
610
611// -----
612// This test exercises gpu.launch when neither optional attribute (kernelModule, kernelFunc) is specified.
613// CHECK-LABEL: func.func @testNoAttributes()
614// CHECK: gpu.launch_func  @testNoAttributes_kernel::@testNoAttributes_kernel blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]])
615
616// CHECK: gpu.module @testNoAttributes_kernel
617// CHECK: gpu.func @testNoAttributes_kernel()
// Test input: the launch has no naming attributes. The directives expect the
// module and the kernel function both to be named @testNoAttributes_kernel.
618func.func @testNoAttributes() {
619  %gDimX = arith.constant 8 : index
620  %gDimY = arith.constant 12 : index
621  %gDimZ = arith.constant 16 : index
622  %bDimX = arith.constant 32 : index
623  %bDimY = arith.constant 16 : index
624  %bDimZ = arith.constant 8 : index
625
626  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ)
627             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) {
628    "some_op"(%bx, %tx) : (index, index) -> ()
629    gpu.terminator
630  }
631  return
632}
633