// RUN: mlir-opt -allow-unregistered-dialect %s | FileCheck %s
// Verify the printed output can be parsed.
// RUN: mlir-opt -allow-unregistered-dialect %s | mlir-opt -allow-unregistered-dialect | FileCheck %s
// Verify the generic form can be parsed.
// RUN: mlir-opt -allow-unregistered-dialect -mlir-print-op-generic %s | mlir-opt -allow-unregistered-dialect | FileCheck %s

module attributes {gpu.container_module} {

  // CHECK-LABEL:func @no_args(%{{.*}}: index)
  func.func @no_args(%sz : index) {
    // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
    gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
               threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) {
      // CHECK: gpu.terminator
      gpu.terminator
    }
    return
  }

  // CHECK-LABEL:func @args(%{{.*}}: index, %{{.*}}: index, %{{.*}}: f32, %{{.*}}: memref<?xf32, 1>) {
  func.func @args(%blk : index, %thrd : index, %float : f32, %data : memref<?xf32,1>) {
    // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
    gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
               threads(%tx, %ty, %tz) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd) {
      "use"(%float) : (f32) -> ()
      "use"(%data) : (memref<?xf32,1>) -> ()
      // CHECK: gpu.terminator
      gpu.terminator
    }
    return
  }

  // CHECK-LABEL:func @launch_async(%{{.*}}: index, %{{.*}}: index) {
  func.func @launch_async(%blk : index, %thrd : index) {
    // CHECK: gpu.launch async [%{{.+}}] blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
    %t = gpu.wait async
    %name = gpu.launch async [%t] blocks(%arg0, %arg1, %arg2) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
               threads(%arg3, %arg4, %arg5) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd) {
      gpu.terminator
    }
    return
  }

  // CHECK-LABEL:func @launch_async_no_deps(%{{.*}}: index, %{{.*}}: index) {
  func.func @launch_async_no_deps(%blk : index, %thrd : index) {
    // CHECK: %{{.*}} = gpu.launch async blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
    %t0 = gpu.launch async blocks(%arg0, %arg1, %arg2) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
               threads(%arg3, %arg4, %arg5) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd) {
      gpu.terminator
    }
    // CHECK: gpu.launch async blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
    %t1 = gpu.launch async [] blocks(%arg0, %arg1, %arg2) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
               threads(%arg3, %arg4, %arg5) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd) {
      gpu.terminator
    }
    return
  }

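  // Test GPU index/dimension queries, reductions, shuffles, and barriers inside kernel functions.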
  gpu.module @kernels {
    gpu.func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32, 1>) kernel {
      %tIdX = gpu.thread_id x
      // CHECK:      thread_id_x
      %tIdY = gpu.thread_id y
      // CHECK-NEXT: thread_id_y
      %tIdZ = gpu.thread_id z
      // CHECK-NEXT: thread_id_z

      %bDimX = gpu.block_dim x
      // CHECK-NEXT: block_dim_x
      %bDimY = gpu.block_dim y
      // CHECK-NEXT: block_dim_y
      %bDimZ = gpu.block_dim z
      // CHECK-NEXT: block_dim_z

      %bIdX = gpu.block_id x
      // CHECK-NEXT: block_id_x
      %bIdY = gpu.block_id y
      // CHECK-NEXT: block_id_y
      %bIdZ = gpu.block_id z
      // CHECK-NEXT: block_id_z

      %gDimX = gpu.grid_dim x
      // CHECK-NEXT: grid_dim_x
      %gDimY = gpu.grid_dim y
      // CHECK-NEXT: grid_dim_y
      %gDimZ = gpu.grid_dim z
      // CHECK-NEXT: grid_dim_z

      %gIdX = gpu.global_id x
      // CHECK-NEXT: global_id_x
      %gIdY = gpu.global_id y
      // CHECK-NEXT: global_id_y
      %gIdZ = gpu.global_id z
      // CHECK-NEXT: global_id_z

      %sgId = gpu.subgroup_id : index
      %numSg = gpu.num_subgroups : index
      %SgSi = gpu.subgroup_size : index

      %one = arith.constant 1.0 : f32

      %vec = vector.broadcast %arg0 : f32 to vector<4xf32>

      // CHECK: %{{.*}} = gpu.all_reduce add %{{.*}} {
      // CHECK-NEXT: } : (f32) -> f32
      %sum = gpu.all_reduce add %one {} : (f32) -> (f32)

      // CHECK: %{{.*}} = gpu.all_reduce add %{{.*}} uniform {
      // CHECK-NEXT: } : (f32) -> f32
      %sum1 = gpu.all_reduce add %one uniform {} : (f32) -> f32

      // CHECK: %{{.*}} = gpu.all_reduce %{{.*}} {
      // CHECK-NEXT: ^{{.*}}(%{{.*}}: f32, %{{.*}}: f32):
      // CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
      // CHECK-NEXT: gpu.yield %{{.*}} : f32
      // CHECK-NEXT: } : (f32) -> f32
      %sum2 = gpu.all_reduce %one {
      ^bb(%lhs : f32, %rhs : f32):
        %tmp = arith.addf %lhs, %rhs : f32
        gpu.yield %tmp : f32
      } : (f32) -> (f32)

      // CHECK: %{{.*}} = gpu.subgroup_reduce add %{{.*}} : (f32) -> f32
      %sum_subgroup = gpu.subgroup_reduce add %one : (f32) -> f32

      // CHECK: %{{.*}} = gpu.subgroup_reduce add %{{.*}} uniform : (f32) -> f32
      %sum_subgroup1 = gpu.subgroup_reduce add %one uniform : (f32) -> f32

      // CHECK: %{{.*}} = gpu.subgroup_reduce add %{{.*}} : (vector<4xf32>) -> vector<4xf32>
      %sum_subgroup2 = gpu.subgroup_reduce add %vec : (vector<4xf32>) -> vector<4xf32>

      %width = arith.constant 7 : i32
      %offset = arith.constant 3 : i32
      // CHECK: gpu.shuffle xor %{{.*}}, %{{.*}}, %{{.*}} : f32
      %shfl, %pred = gpu.shuffle xor %arg0, %offset, %width : f32
      // CHECK: gpu.shuffle up %{{.*}}, %{{.*}}, %{{.*}} : f32
      %shfl1, %pred1 = gpu.shuffle up %arg0, %offset, %width : f32
      // CHECK: gpu.shuffle down %{{.*}}, %{{.*}}, %{{.*}} : f32
      %shfl2, %pred2 = gpu.shuffle down %arg0, %offset, %width : f32
      // CHECK: gpu.shuffle idx %{{.*}}, %{{.*}}, %{{.*}} : f32
      %shfl3, %pred3 = gpu.shuffle idx %arg0, %offset, %width : f32

      "gpu.barrier"() : () -> ()

      "some_op"(%bIdX, %tIdX) : (index, index) -> ()
      %42 = memref.load %arg1[%bIdX] : memref<?xf32, 1>
      gpu.return
    }

    gpu.func @kernel_2() kernel {
      gpu.return
    }
  }

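  // Test gpu.binary with different offloading handlers and object formats.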
  gpu.binary @binary_1 [#gpu.object<#nvvm.target, "">]

  gpu.binary @binary_2 <#gpu.select_object<#nvvm.target<chip = "sm_90">>> [#gpu.object<#nvvm.target, "">, #gpu.object<#nvvm.target<chip = "sm_90">, "">]

  gpu.binary @binary_3 <#gpu.select_object<1>> [#gpu.object<#nvvm.target, "">, #gpu.object<#nvvm.target<chip = "sm_90">, "">]

  gpu.binary @binary_4 [#gpu.object<#nvvm.target, bin = "">,
                        #gpu.object<#nvvm.target, assembly = "">,
                        #gpu.object<#nvvm.target, offload = "">,
                        #gpu.object<#nvvm.target, properties = { O = 3 : i32 }, offload = "">
                        ]

  // Check that fatbin gets elided as it's the default format.
  // CHECK: gpu.binary @binary_5 [#gpu.object<#nvvm.target, properties = {O = 3 : i32}, "">]
  gpu.binary @binary_5 [#gpu.object<#nvvm.target, properties = {O = 3 : i32}, fatbin = "">]

  func.func private @two_value_generator() -> (f32, memref<?xf32, 1>)

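  // Test the different forms of gpu.launch_func.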
  func.func @foo() {
    %0 = "op"() : () -> (f32)
    %1 = "op"() : () -> (memref<?xf32, 1>)
    // CHECK: %{{.*}} = arith.constant 8
    %cst = arith.constant 8 : index
    %cstI64 = arith.constant 8 : i64
    %c0 = arith.constant 0 : i32
    %t0 = gpu.wait async
    %lowStream = llvm.mlir.zero : !llvm.ptr

    // CHECK: gpu.launch_func @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
    gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) args(%0 : f32, %1 : memref<?xf32, 1>)

    // CHECK: gpu.launch_func @kernels::@kernel_1 clusters in (%{{.*}}, %{{.*}}, %{{.*}}) blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
    gpu.launch_func @kernels::@kernel_1 clusters in (%cst, %cst, %cst) blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) args(%0 : f32, %1 : memref<?xf32, 1>)

    gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) dynamic_shared_memory_size %c0 args(%0 : f32, %1 : memref<?xf32, 1>)

    // CHECK: gpu.launch_func @kernels::@kernel_2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})
    gpu.launch_func @kernels::@kernel_2 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst)

    // CHECK: %{{.*}} = gpu.launch_func async [%{{.*}}] @kernels::@kernel_2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})
    %t1 = gpu.launch_func async [%t0] @kernels::@kernel_2 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst)

    // CHECK: gpu.launch_func <%{{.*}} : !llvm.ptr> @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i64 args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
    gpu.launch_func <%lowStream : !llvm.ptr> @kernels::@kernel_1 blocks in (%cstI64, %cstI64, %cstI64) threads in (%cstI64, %cstI64, %cstI64) : i64 args(%0 : f32, %1 : memref<?xf32, 1>)

    // CHECK: gpu.launch_func @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i32 args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
    gpu.launch_func @kernels::@kernel_1 blocks in (%c0, %c0, %c0) threads in (%c0, %c0, %c0) : i32 args(%0 : f32, %1 : memref<?xf32, 1>)

    // CHECK: gpu.launch_func @binary_1::@kernel blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i32 args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
    gpu.launch_func @binary_1::@kernel blocks in (%c0, %c0, %c0) threads in (%c0, %c0, %c0) : i32 args(%0 : f32, %1 : memref<?xf32, 1>)

    // CHECK: %[[VALUES:.*]]:2 = call
    %values:2 = func.call @two_value_generator() : () -> (f32, memref<?xf32, 1>)
    // CHECK: gpu.launch_func @kernels::@kernel_1 {{.*}} args(%[[VALUES]]#0 : f32, %[[VALUES]]#1 : memref<?xf32, 1>)
    gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) args(%values#0 : f32, %values#1 : memref<?xf32, 1>)

    return
  }

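  // Test gpu.func with memory attributions, gpu.printf, and attribute variants.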
  gpu.module @gpu_funcs {
    // CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32)
    // CHECK:       workgroup
    // CHECK:       private
    // CHECK:       attributes
    gpu.func @kernel_1(%arg0: f32)
        workgroup(%arg1: memref<42xf32, 3>)
        private(%arg2: memref<2xf32, 5>, %arg3: memref<1xf32, 5>)
        kernel
        attributes {foo="bar"} {
      "use"(%arg1) : (memref<42xf32, 3>) -> ()
      "use"(%arg2) : (memref<2xf32, 5>) -> ()
      "use"(%arg3) : (memref<1xf32, 5>) -> ()
      gpu.return
    }

    // CHECK-LABEL: gpu.func @printf_test
    // CHECK: (%[[ARG0:.*]]: i32)
    // CHECK: gpu.printf "Value: %d", %[[ARG0]] : i32
    gpu.func @printf_test(%arg0 : i32) {
      gpu.printf "Value: %d", %arg0 : i32
      gpu.return
    }

    // CHECK-LABEL: gpu.func @printf_empty
    // CHECK: gpu.printf  "]"
    // CHECK: scf.if
    // CHECK: gpu.printf ", "
    gpu.func @printf_empty(%arg0 : i32) {
      gpu.printf "]"
      %1 = arith.cmpi slt, %arg0, %arg0 : i32
      scf.if %1 {
        gpu.printf ", "
      }
      gpu.return
    }

    // CHECK-LABEL: gpu.func @no_attribution
    // CHECK: {
    gpu.func @no_attribution(%arg0: f32) {
      gpu.return
    }

    // CHECK-LABEL: @no_attribution_attrs
    // CHECK:       attributes
    // CHECK:       {
    gpu.func @no_attribution_attrs(%arg0: f32) attributes {foo="bar"} {
      gpu.return
    }

    // CHECK-LABEL: @workgroup_only
    // CHECK:       workgroup({{.*}}: {{.*}})
    // CHECK:       {
    gpu.func @workgroup_only() workgroup(%arg0: memref<42xf32, 3>) {
      gpu.return
    }
    // CHECK-LABEL: @private_only
    // CHECK:       private({{.*}}: {{.*}})
    // CHECK:       {
    gpu.func @private_only() private(%arg0: memref<2xf32, 5>) {
      gpu.return
    }

    // CHECK-LABEL: @empty_attribution
    // CHECK:       {
    gpu.func @empty_attribution(%arg0: f32) workgroup() private() {
      gpu.return
    }
  }

  gpu.module @explicit_attributions {
    // CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32, {{.*}}: memref<?xf32>) workgroup({{.*}}: memref<5xf32, 3>) private({{.*}}: memref<5xf32, 5>)
    "gpu.func"() ({
    ^bb0(%arg0: f32, %arg1: memref<?xf32>, %arg2: memref<5xf32, 3>, %arg3: memref<5xf32, 5>):
      "gpu.return"() : () -> ()
    } ) {function_type = (f32, memref<?xf32>) -> (), gpu.kernel, sym_name = "kernel_1", workgroup_attributions = 1: i64} : () -> ()
  }

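  // Test gpu.alloc/gpu.dealloc, including the async and host_shared variants.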
  func.func @alloc() {
    // CHECK-LABEL: func @alloc()

    // CHECK: %[[m0:.*]] = gpu.alloc () : memref<13xf32, 1>
    %m0 = gpu.alloc () : memref<13xf32, 1>
    // CHECK: gpu.dealloc %[[m0]] : memref<13xf32, 1>
    gpu.dealloc %m0 : memref<13xf32, 1>

    %t0 = gpu.wait async
    // CHECK: %[[m1:.*]], %[[t1:.*]] = gpu.alloc async [{{.*}}] () : memref<13xf32, 1>
    %m1, %t1 = gpu.alloc async [%t0] () : memref<13xf32, 1>
    // CHECK: gpu.dealloc async [%[[t1]]] %[[m1]] : memref<13xf32, 1>
    %t2 = gpu.dealloc async [%t1] %m1 : memref<13xf32, 1>

    // CHECK: %[[m2:.*]] = gpu.alloc host_shared () : memref<13xf32, 1>
    %m2 = gpu.alloc host_shared () : memref<13xf32, 1>
    // CHECK: gpu.dealloc %[[m2]] : memref<13xf32, 1>
    gpu.dealloc %m2 : memref<13xf32, 1>

    return
  }

  func.func @async_token(%arg0 : !gpu.async.token) -> !gpu.async.token {
    // CHECK-LABEL: func @async_token({{.*}}: !gpu.async.token)
    // CHECK: return {{.*}} : !gpu.async.token
    return %arg0 : !gpu.async.token
  }

  func.func @async_wait() {
    // CHECK-LABEL: func @async_wait
    // CHECK: %[[t0:.*]] = gpu.wait async
    %0 = gpu.wait async
    // CHECK: %[[t1:.*]] = gpu.wait async [%[[t0]]]
    %1 = gpu.wait async [%0]
    // CHECK: %{{.*}} = gpu.wait async [%[[t0]], %[[t1]]]
    %2 = gpu.wait async [%0, %1]
    // CHECK: gpu.wait [%[[t0]], %[[t1]]]
    // CHECK-NOT: async
    gpu.wait [%0, %1]
    // CHECK: gpu.wait
    // CHECK-NOT: async
    gpu.wait // Valid, but a no-op.
    return
  }

  func.func @memcpy(%dst : memref<3x7xf32>, %src : memref<3x7xf32, 1>) {
    // CHECK-LABEL: func @memcpy
    // CHECK: gpu.memcpy {{.*}}, {{.*}} : memref<3x7xf32>, memref<3x7xf32, 1>
    gpu.memcpy %dst, %src : memref<3x7xf32>, memref<3x7xf32, 1>
    // CHECK: %[[t0:.*]] = gpu.wait async
    %0 = gpu.wait async
    // CHECK: {{.*}} = gpu.memcpy async [%[[t0]]] {{.*}}, {{.*}} : memref<3x7xf32>, memref<3x7xf32, 1>
    %1 = gpu.memcpy async [%0] %dst, %src : memref<3x7xf32>, memref<3x7xf32, 1>
    return
  }

  func.func @memset(%dst : memref<3x7xf32>, %value : f32) {
    // CHECK-LABEL: func @memset
    // CHECK: gpu.memset {{.*}}, {{.*}} : memref<3x7xf32>, f32
    gpu.memset %dst, %value : memref<3x7xf32>, f32
    // CHECK: %[[t0:.*]] = gpu.wait async
    %0 = gpu.wait async
    // CHECK: {{.*}} = gpu.memset async [%[[t0]]] {{.*}}, {{.*}} : memref<3x7xf32>, f32
    %1 = gpu.memset async [%0] %dst, %value : memref<3x7xf32>, f32
    return
  }

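  // Test subgroup MMA load/store, constant, and elementwise operations.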
  func.func @mmamatrix_valid_scalar_element_type(%src : memref<32x32xf16, affine_map<(d0, d1) -> (d0 * 64 + d1)>>) {
    // CHECK-LABEL: func @mmamatrix_valid_scalar_element_type
    %wg = memref.alloca() {alignment = 32} : memref<32x32xf16, 3>
    // CHECK: %[[wg:.*]] = memref.alloca()
    %i = arith.constant 16 : index
    // CHECK: %[[i:.*]] = arith.constant 16 : index
    %cst = arith.constant 1.000000e+00 : f32
    // CHECK: %[[cst:.*]] = arith.constant 1.000000e+00 : f32
    %0 = gpu.subgroup_mma_load_matrix %wg[%i, %i] {leadDimension = 32 : index} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
    // CHECK: gpu.subgroup_mma_load_matrix %[[wg]][%[[i]], %[[i]]] {leadDimension = 32 : index} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
    %s = gpu.subgroup_mma_load_matrix %src[%i, %i] {leadDimension = 64 : index} : memref<32x32xf16, affine_map<(d0, d1) -> (d0 * 64 + d1)>> -> !gpu.mma_matrix<16x16xf16, "AOp">
    // CHECK: gpu.subgroup_mma_load_matrix %{{.*}}[%[[i]], %[[i]]] {leadDimension = 64 : index} : memref<32x32xf16, #{{.*}}> -> !gpu.mma_matrix<16x16xf16, "AOp">
    %1 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
    // CHECK: gpu.subgroup_mma_elementwise addf %{{.*}}, %{{.*}} : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp">
    %2 = gpu.subgroup_mma_elementwise addf %1, %1 : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp">
    // CHECK: gpu.subgroup_mma_elementwise maxf %{{.*}}, %{{.*}} : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp">
    %3 = gpu.subgroup_mma_elementwise maxf %2, %1 : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp">
    return
  }

  // CHECK-LABEL: func @mmamatrix_valid_vector_element_type
  func.func @mmamatrix_valid_vector_element_type(%src : memref<32x4xvector<4xf32>>, %i : index) {
    // CHECK: gpu.subgroup_mma_load_matrix
    %s = gpu.subgroup_mma_load_matrix %src[%i, %i] {leadDimension = 4 : index} : memref<32x4xvector<4xf32>> -> !gpu.mma_matrix<16x16xf16, "COp">
    // CHECK: gpu.subgroup_mma_store_matrix
    gpu.subgroup_mma_store_matrix %s, %src[%i, %i] {leadDimension = 4 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<32x4xvector<4xf32>>
    return
  }

  // CHECK-LABEL: func @set_default_device
  func.func @set_default_device(%arg0: i32) {
    // CHECK: gpu.set_default_device
    gpu.set_default_device %arg0
    return
  }

  // CHECK-LABEL: func @sparse_ops
  func.func @sparse_ops(%arg0: index) {
    // CHECK: gpu.wait async
    %token0 = gpu.wait async
    // CHECK: gpu.alloc async
    %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
    // CHECK: gpu.alloc async
    %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
    // CHECK: gpu.create_coo async
    %spmat, %token4 = gpu.create_coo async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
    // CHECK: gpu.create_csr async
    %spmat2, %token5 = gpu.create_csr async [%token4] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
    // CHECK: gpu.create_dn_tensor async
    %dnvec, %token6 = gpu.create_dn_tensor async [%token5] %mem2, %arg0 : index into memref<?xf64>
    // CHECK: gpu.spmv_buffer_size async
    %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %spmat, %dnvec, %dnvec into f64
    // CHECK: gpu.spmv async
    %token8 = gpu.spmv async [%token7] %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64> into f64
    // CHECK: gpu.create_dn_tensor async
    %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %mem2, %arg0, %arg0 : index, index into memref<?xf64>
    // CHECK: gpu.spmm_buffer_size async
    %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %spmat, %dnmat, %dnmat : index into f64
    // CHECK: gpu.spmm async
    %token11 = gpu.spmm async [%token10] %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64> into f64
    // CHECK: gpu.sddmm_buffer_size async
    %bufferSz3, %token12 = gpu.sddmm_buffer_size async [%token11] %dnmat, %dnmat, %spmat into f64
    // CHECK: gpu.sddmm async
    %token13 = gpu.sddmm async [%token12] %dnmat, %dnmat, %spmat, %mem2 : memref<?xf64> into f64
    // CHECK: gpu.destroy_dn_tensor async
    %token14 = gpu.destroy_dn_tensor async [%token13] %dnmat
    // CHECK: gpu.destroy_sp_mat async
    %token15 = gpu.destroy_sp_mat async [%token14] %spmat
    // CHECK: gpu.destroy_dn_tensor async
    %token16 = gpu.destroy_dn_tensor async [%token15] %dnvec
    // CHECK: gpu.wait
    gpu.wait [%token16]
    return
  }
}

// Just check that this doesn't crash.
gpu.module @module {
  "gpu.func"() ({
    gpu.return
  }) {function_type = () -> (), sym_name = "func"} : () -> ()
}

// Check that this doesn't crash.
gpu.module @module_with_one_target [#nvvm.target] {
  gpu.func @kernel(%arg0 : f32) kernel {
    gpu.return
  }
}

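// Check that a module with multiple targets parses.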
gpu.module @module_with_two_target [#nvvm.target, #rocdl.target<chip = "gfx90a">] {
  gpu.func @kernel(%arg0 : f32) kernel {
    gpu.return
  }
}

gpu.module @module_with_offload_handler <#gpu.select_object<0>> [#nvvm.target] {
}

// Test kernel attributes
gpu.binary @kernel_attrs_1 [
    #gpu.object<#rocdl.target<chip = "gfx900">,
      kernels = #gpu.kernel_table<[
        #gpu.kernel_metadata<"kernel0", (i32, f32) -> (), metadata = {sgpr_count = 255}>,
        #gpu.kernel_metadata<"kernel1", (i32) -> (), arg_attrs = [{llvm.read_only}]>
      ]>,
      bin = "BLOB">
  ]

// Verify the kernels are sorted
// CHECK-LABEL: gpu.binary @kernel_attrs_2
gpu.binary @kernel_attrs_2 [
    // CHECK: [#gpu.kernel_metadata<"a_kernel", () -> ()>, #gpu.kernel_metadata<"m_kernel", () -> ()>, #gpu.kernel_metadata<"z_kernel", () -> ()>]
    #gpu.object<#rocdl.target<chip = "gfx900">,
      kernels = #gpu.kernel_table<[
        #gpu.kernel_metadata<"z_kernel", () -> ()>,
        #gpu.kernel_metadata<"m_kernel", () -> ()>,
        #gpu.kernel_metadata<"a_kernel", () -> ()>
      ]>,
      bin = "BLOB">
  ]

// CHECK-LABEL:   func @warp_execute_on_lane_0(
func.func @warp_execute_on_lane_0(%laneid: index) {
//  CHECK-NEXT:     gpu.warp_execute_on_lane_0(%{{.*}})[32] {
  gpu.warp_execute_on_lane_0(%laneid)[32] {
//  CHECK-NEXT:     }
  }
//  CHECK-NEXT:     return
  return
}

// CHECK-LABEL: func.func @warp_execute_on_lane_0_2d
func.func @warp_execute_on_lane_0_2d(%laneid: index) {
  //  CHECK: gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1x4xi32>)
  %2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x4xi32>) {
    %0 = arith.constant dense<2>: vector<4x32xi32>
    // CHECK: gpu.yield %{{.+}} : vector<4x32xi32>
    gpu.yield %0 : vector<4x32xi32>
  }
  return
}

// CHECK-LABEL:   func @warp_operand_result(
func.func @warp_operand_result(%laneid: index, %v0 : vector<4xi32>) -> (vector<4xi32>) {
//  CHECK-NEXT:     %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%{{.*}} : vector<4xi32>) -> (vector<4xi32>) {
  %2 = gpu.warp_execute_on_lane_0(%laneid)[32]
  args(%v0 : vector<4xi32>) -> (vector<4xi32>) {
   ^bb0(%arg0 : vector<128xi32>) :
    %0 = arith.constant dense<2>: vector<128xi32>
    %1 = arith.addi %arg0, %0 : vector<128xi32>
//       CHECK:       gpu.yield %{{.*}} : vector<128xi32>
    gpu.yield %1 : vector<128xi32>
//  CHECK-NEXT:     }
  }
  return %2 : vector<4xi32>
}
516