Lines matching refs:gpu — reference search over an MLIR GPU dialect parser/printer test. Each matching line is prefixed with its line number in the source file; lines that do not mention gpu (including continuations of multi-line ops) are omitted. The `// CHECK:` comments are FileCheck directives that verify how each op prints back after parsing.
7 module attributes {gpu.container_module} {
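// The gpu.container_module unit attribute marks a module whose body may hold gpu.module definitions and
// gpu.launch_func calls into them. The first group of matches exercises plain gpu.launch: blocks(...)/threads(...)
// size clauses and a body region closed by gpu.terminator.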
11 // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
12 gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
14 // CHECK: gpu.terminator
15 gpu.terminator
22 // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
23 gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
27 // CHECK: gpu.terminator
28 gpu.terminator
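// gpu.launch also has an asynchronous form: the async keyword, with an optional [...] list of !gpu.async.token
// dependencies, makes the op return a token instead of blocking. An explicitly empty dependency list (async [])
// round-trips to plain async, as the CHECK lines below show.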
35 // CHECK: gpu.launch async [%{{.+}}] blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
36 %t = gpu.wait async
37 %name = gpu.launch async [%t] blocks(%arg0, %arg1, %arg2) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
39 gpu.terminator
46 // CHECK: %{{.*}} = gpu.launch async blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
47 %t0 = gpu.launch async blocks(%arg0, %arg1, %arg2) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
49 gpu.terminator
51 // CHECK: gpu.launch async blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
52 %t1 = gpu.launch async [] blocks(%arg0, %arg1, %arg2) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
54 gpu.terminator
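// Inside a gpu.module, gpu.func ops marked `kernel` are launchable kernels. The body below queries the built-in
// id/dimension ops (gpu.thread_id, gpu.block_dim, gpu.block_id, gpu.grid_dim, gpu.global_id, each per dimension
// x/y/z) and the subgroup queries (gpu.subgroup_id, gpu.num_subgroups, gpu.subgroup_size), all yielding index values.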
59 gpu.module @kernels {
60 gpu.func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32, 1>) kernel {
61 %tIdX = gpu.thread_id x
63 %tIdY = gpu.thread_id y
65 %tIdZ = gpu.thread_id z
68 %bDimX = gpu.block_dim x
70 %bDimY = gpu.block_dim y
72 %bDimZ = gpu.block_dim z
75 %bIdX = gpu.block_id x
77 %bIdY = gpu.block_id y
79 %bIdZ = gpu.block_id z
82 %gDimX = gpu.grid_dim x
84 %gDimY = gpu.grid_dim y
86 %gDimZ = gpu.grid_dim z
89 %gIdX = gpu.global_id x
91 %gIdY = gpu.global_id y
93 %gIdZ = gpu.global_id z
96 %sgId = gpu.subgroup_id : index
97 %numSg = gpu.num_subgroups : index
98 %SgSi = gpu.subgroup_size : index
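// Workgroup- and subgroup-wide reductions: gpu.all_reduce accepts either a named accumulation (add) or a custom
// region that combines two values and gpu.yields the result, optionally marked uniform; gpu.subgroup_reduce
// exercises the same named form for scalars and vectors.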
104 // CHECK: %{{.*}} = gpu.all_reduce add %{{.*}} {
106 %sum = gpu.all_reduce add %one {} : (f32) -> (f32)
108 // CHECK: %{{.*}} = gpu.all_reduce add %{{.*}} uniform {
110 %sum1 = gpu.all_reduce add %one uniform {} : (f32) -> f32
112 // CHECK: %{{.*}} = gpu.all_reduce %{{.*}} {
115 // CHECK-NEXT: gpu.yield %{{.*}} : f32
117 %sum2 = gpu.all_reduce %one {
120 gpu.yield %tmp : f32
123 // CHECK: %{{.*}} = gpu.subgroup_reduce add %{{.*}} : (f32) -> f32
124 %sum_subgroup = gpu.subgroup_reduce add %one : (f32) -> f32
126 // CHECK: %{{.*}} = gpu.subgroup_reduce add %{{.*}} uniform : (f32) -> f32
127 %sum_subgroup1 = gpu.subgroup_reduce add %one uniform : (f32) -> f32
129 // CHECK: %{{.*}} = gpu.subgroup_reduce add %{{.*}} : (vector<4xf32>) -> vector<4xf32>
130 %sum_subgroup2 = gpu.subgroup_reduce add %vec : (vector<4xf32>) -> vector<4xf32>
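// gpu.shuffle exchanges a value between lanes of a subgroup in xor, up, down, and idx modes, returning the shuffled
// value plus an i1 flag indicating whether the source lane was valid; gpu.barrier (written in generic form below)
// synchronizes the workgroup.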
134 // CHECK: gpu.shuffle xor %{{.*}}, %{{.*}}, %{{.*}} : f32
135 %shfl, %pred = gpu.shuffle xor %arg0, %offset, %width : f32
136 // CHECK: gpu.shuffle up %{{.*}}, %{{.*}}, %{{.*}} : f32
137 %shfl1, %pred1 = gpu.shuffle up %arg0, %offset, %width : f32
138 // CHECK: gpu.shuffle down %{{.*}}, %{{.*}}, %{{.*}} : f32
139 %shfl2, %pred2 = gpu.shuffle down %arg0, %offset, %width : f32
140 // CHECK: gpu.shuffle idx %{{.*}}, %{{.*}}, %{{.*}} : f32
141 %shfl3, %pred3 = gpu.shuffle idx %arg0, %offset, %width : f32
143 "gpu.barrier"() : () -> ()
147 gpu.return
150 gpu.func @kernel_2() kernel {
151 gpu.return
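// gpu.binary ops embed compiled objects, one #gpu.object per target attribute (#nvvm.target here). An optional
// offloading handler such as #gpu.select_object picks an object by target or by index, and each object may carry a
// format keyword (bin, assembly, offload, fatbin) and a properties dictionary.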
155 gpu.binary @binary_1 [#gpu.object<#nvvm.target, "">]
157 gpu.binary @binary_2 <#gpu.select_object<#nvvm.target<chip = "sm_90">>> [#gpu.object<#nvvm.target, "">, #gpu.object<#nvvm.target<chip = "sm_90">, "">]
159 gpu.binary @binary_3 <#gpu.select_object<1>> [#gpu.object<#nvvm.target, "">, #gpu.object<#nvvm.target<chip = "sm_90">, "">]
161 gpu.binary @binary_4 [#gpu.object<#nvvm.target, bin = "">,
162 #gpu.object<#nvvm.target, assembly = "">,
163 #gpu.object<#nvvm.target, offload = "">,
164 #gpu.object<#nvvm.target, properties = { O = 3 : i32 }, offload = "">
168 // CHECK: gpu.binary @binary_5 [#gpu.object<#nvvm.target, properties = {O = 3 : i32}, "">]
169 gpu.binary @binary_5 [#gpu.object<#nvvm.target, properties = {O = 3 : i32}, fatbin = "">]
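// gpu.launch_func launches a kernel by symbol (@module::@name) with block/thread counts and, optionally, a clusters
// clause, dynamic_shared_memory_size, an async token list, an explicit stream operand (<%stream : !llvm.ptr>), a
// size type suffix (: i32 / : i64), and the kernel arguments. Kernels can also be referenced inside a gpu.binary
// (@binary_1::@kernel).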
180 %t0 = gpu.wait async
183 // CHECK: gpu.launch_func @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
184 gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) args(%0 : f32, %1 : memref<?xf32, 1>)
186 // CHECK: gpu.launch_func @kernels::@kernel_1 clusters in (%{{.*}}, %{{.*}}, %{{.*}}) blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
187 gpu.launch_func @kernels::@kernel_1 clusters in (%cst, %cst, %cst) blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) args(%0 : f32, %1 : memref<?xf32, 1>)
189 gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) dynamic_shared_memory_size %c0 args(%0 : f32, %1 : memref<?xf32, 1>)
191 // CHECK: gpu.launch_func @kernels::@kernel_2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})
192 gpu.launch_func @kernels::@kernel_2 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst)
194 // CHECK: %{{.*}} = gpu.launch_func async [%{{.*}}] @kernels::@kernel_2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})
195 %t1 = gpu.launch_func async [%t0] @kernels::@kernel_2 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst)
197 // CHECK: gpu.launch_func <%{{.*}} : !llvm.ptr> @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i64 args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
198 gpu.launch_func <%lowStream : !llvm.ptr> @kernels::@kernel_1 blocks in (%cstI64, %cstI64, %cstI64) threads in (%cstI64, %cstI64, %cstI64) : i64 args(%0 : f32, %1 : memref<?xf32, 1>)
200 // CHECK: gpu.launch_func @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i32 args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
201 gpu.launch_func @kernels::@kernel_1 blocks in (%c0, %c0, %c0) threads in (%c0, %c0, %c0) : i32 args(%0 : f32, %1 : memref<?xf32, 1>)
203 // CHECK: gpu.launch_func @binary_1::@kernel blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i32 args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
204 gpu.launch_func @binary_1::@kernel blocks in (%c0, %c0, %c0) threads in (%c0, %c0, %c0) : i32 args(%0 : f32, %1 : memref<?xf32, 1>)
208 // CHECK: gpu.launch_func @kernels::@kernel_1 {{.*}} args(%[[VALUES]]#0 : f32, %[[VALUES]]#1 : memref<?xf32, 1>)
209 gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) args(%values#0 : f32, %values#1 : memref<?xf32, 1>)
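// gpu.func signatures can declare workgroup(...) and private(...) memory attributions in addition to ordinary
// arguments, and gpu.printf provides formatted printing from device code; the generic-form "gpu.func" at the end of
// this group spells the same attributions via the workgroup_attributions attribute.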
214 gpu.module @gpu_funcs {
215 // CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32)
219 gpu.func @kernel_1(%arg0: f32)
227 gpu.return
230 // CHECK-LABEL: gpu.func @printf_test
232 // CHECK: gpu.printf "Value: %d", %[[ARG0]] : i32
233 gpu.func @printf_test(%arg0 : i32) {
234 gpu.printf "Value: %d", %arg0 : i32
235 gpu.return
238 // CHECK-LABEL: gpu.func @printf_empty
239 // CHECK: gpu.printf "]"
241 // CHECK: gpu.printf ", "
242 gpu.func @printf_empty(%arg0 : i32) {
243 gpu.printf "]"
246 gpu.printf ", "
248 gpu.return
251 // CHECK-LABEL: gpu.func @no_attribution
253 gpu.func @no_attribution(%arg0: f32) {
254 gpu.return
260 gpu.func @no_attribution_attrs(%arg0: f32) attributes {foo="bar"} {
261 gpu.return
267 gpu.func @workgroup_only() workgroup(%arg0: memref<42xf32, 3>) {
268 gpu.return
273 gpu.func @private_only() private(%arg0: memref<2xf32, 5>) {
274 gpu.return
279 gpu.func @empty_attribution(%arg0: f32) workgroup() private() {
280 gpu.return
284 gpu.module @explicit_attributions {
285 // CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32, {{.*}}: memref<?xf32>) workgroup({{.*}}: memref<5xf32, 3>) private({{.*}}: memref<5xf32, 5>)
286 "gpu.func"() ({
288 "gpu.return"() : () -> ()
289 } ) {function_type = (f32, memref<?xf32>) -> (), gpu.kernel, sym_name = "kernel_1", workgroup_attributions = 1: i64} : () -> ()
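// gpu.alloc / gpu.dealloc manage device memory; host_shared requests an allocation also visible to the host, and
// the async forms consume and produce !gpu.async.token values so allocation can be ordered on a stream.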
295 // CHECK: %[[m0:.*]] = gpu.alloc () : memref<13xf32, 1>
296 %m0 = gpu.alloc () : memref<13xf32, 1>
297 // CHECK: gpu.dealloc %[[m0]] : memref<13xf32, 1>
298 gpu.dealloc %m0 : memref<13xf32, 1>
300 %t0 = gpu.wait async
301 // CHECK: %[[m1:.*]], %[[t1:.*]] = gpu.alloc async [{{.*}}] () : memref<13xf32, 1>
302 %m1, %t1 = gpu.alloc async [%t0] () : memref<13xf32, 1>
303 // CHECK: gpu.dealloc async [%[[t1]]] %[[m1]] : memref<13xf32, 1>
304 %t2 = gpu.dealloc async [%t1] %m1 : memref<13xf32, 1>
306 // CHECK: %[[m2:.*]] = gpu.alloc host_shared () : memref<13xf32, 1>
307 %m2 = gpu.alloc host_shared () : memref<13xf32, 1>
308 // CHECK: gpu.dealloc %[[m2]] : memref<13xf32, 1>
309 gpu.dealloc %m2 : memref<13xf32, 1>
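// !gpu.async.token is a first-class value: gpu.wait async joins its dependencies into a new token, while gpu.wait
// without async blocks the host until the listed tokens (if any) have completed.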
314 func.func @async_token(%arg0 : !gpu.async.token) -> !gpu.async.token {
315 // CHECK-LABEL: func @async_token({{.*}}: !gpu.async.token)
316 // CHECK: return {{.*}} : !gpu.async.token
317 return %arg0 : !gpu.async.token
322 // CHECK: %[[t0:.*]] = gpu.wait async
323 %0 = gpu.wait async
324 // CHECK: %[[t1:.*]] = gpu.wait async [%[[t0]]]
325 %1 = gpu.wait async [%0]
326 // CHECK: %{{.*}} = gpu.wait async [%[[t0]], %[[t1]]]
327 %2 = gpu.wait async [%0, %1]
328 // CHECK: gpu.wait [%[[t0]], %[[t1]]]
330 gpu.wait [%0, %1]
331 // CHECK: gpu.wait
333 gpu.wait // Valid, but a no-op.
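// gpu.memcpy copies between memrefs (possibly in different memory spaces) and gpu.memset fills a memref with a
// scalar value; both have async, token-chained variants.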
339 // CHECK: gpu.memcpy {{.*}}, {{.*}} : memref<3x7xf32>, memref<3x7xf32, 1>
340 gpu.memcpy %dst, %src : memref<3x7xf32>, memref<3x7xf32, 1>
341 // CHECK: %[[t0:.*]] = gpu.wait async
342 %0 = gpu.wait async
343 // CHECK: {{.*}} = gpu.memcpy async [%[[t0]]] {{.*}}, {{.*}} : memref<3x7xf32>, memref<3x7xf32, 1>
344 %1 = gpu.memcpy async [%0] %dst, %src : memref<3x7xf32>, memref<3x7xf32, 1>
350 // CHECK: gpu.memset {{.*}}, {{.*}} : memref<3x7xf32>, f32
351 gpu.memset %dst, %value : memref<3x7xf32>, f32
352 // CHECK: %[[t0:.*]] = gpu.wait async
353 %0 = gpu.wait async
354 // CHECK: {{.*}} = gpu.memset async [%[[t0]]] {{.*}}, {{.*}} : memref<3x7xf32>, f32
355 %1 = gpu.memset async [%0] %dst, %value : memref<3x7xf32>, f32
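// The subgroup MMA ops move tiles between memrefs and !gpu.mma_matrix values (gpu.subgroup_mma_load_matrix /
// store_matrix, with leadDimension giving the buffer's leading dimension), materialize constants
// (subgroup_mma_constant_matrix), and apply elementwise arithmetic (subgroup_mma_elementwise addf / maxf).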
367 %0 = gpu.subgroup_mma_load_matrix %wg[%i, %i] {leadDimension = 32 : index} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
368 // CHECK: gpu.subgroup_mma_load_matrix %[[wg]][%[[i]], %[[i]]] {leadDimension = 32 : index} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
369 %s = gpu.subgroup_mma_load_matrix %src[%i, %i] {leadDimension = 64 : index} : memref<32x32xf16, affine_map<(d0, d1) -> (d0 * 64 + d1)>> -> !gpu.mma_matrix<16x16xf16, "AOp">
370 // CHECK: gpu.subgroup_mma_load_matrix %{{.*}}[%[[i]], %[[i]]] {leadDimension = 64 : index} : memref<32x32xf16, #{{.*}}> -> !gpu.mma_matrix<16x16xf16, "AOp">
371 %1 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
372 // CHECK: gpu.subgroup_mma_elementwise addf %{{.*}}, %{{.*}} : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp">
373 %2 = gpu.subgroup_mma_elementwise addf %1, %1 : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp">
374 // CHECK: gpu.subgroup_mma_elementwise maxf %{{.*}}, %{{.*}} : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp">
375 %3 = gpu.subgroup_mma_elementwise maxf %2, %1 : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp">
381 // CHECK: gpu.subgroup_mma_load_matrix
382 %s = gpu.subgroup_mma_load_matrix %src[%i, %i] {leadDimension = 4 : index} : memref<32x4xvector<4xf32>> -> !gpu.mma_matrix<16x16xf16, "COp">
383 // CHECK: gpu.subgroup_mma_store_matrix
384 gpu.subgroup_mma_store_matrix %s, %src[%i, %i] {leadDimension = 4 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<32x4xvector<4xf32>>
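// gpu.set_default_device selects, by zero-based index, the device that subsequent GPU ops target.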
390 // CHECK: gpu.set_default_device
391 gpu.set_default_device %arg0
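// The sparse ops model library-style sparse linear algebra on the device: create_coo / create_csr build sparse
// matrix handles from index and value buffers, create_dn_tensor builds dense vector/matrix handles, the
// spmv/spmm/sddmm pairs first query a workspace size (*_buffer_size) and then compute, and destroy_sp_mat /
// destroy_dn_tensor release the handles; everything is chained through async tokens.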
397 // CHECK: gpu.wait async
398 %token0 = gpu.wait async
399 // CHECK: gpu.alloc async
400 %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
401 // CHECK: gpu.alloc async
402 %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
403 // CHECK: gpu.create_coo async
404 %spmat, %token4 = gpu.create_coo async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
405 // CHECK: gpu.create_csr async
406 %spmat2, %token5 = gpu.create_csr async [%token4] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
407 // CHECK: gpu.create_dn_tensor async
408 %dnvec, %token6 = gpu.create_dn_tensor async [%token5] %mem2, %arg0 : index into memref<?xf64>
409 // CHECK: gpu.spmv_buffer_size async
410 %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %spmat, %dnvec, %dnvec into f64
411 // CHECK: gpu.spmv async
412 %token8 = gpu.spmv async [%token7] %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64> into f64
413 // CHECK: gpu.create_dn_tensor async
414 %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %mem2, %arg0, %arg0 : index, index into memref<?xf64>
415 // CHECK: gpu.spmm_buffer_size async
416 %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %spmat, %dnmat, %dnmat : index into f64
417 // CHECK: gpu.spmm async
418 %token11 = gpu.spmm async [%token10] %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64> into f64
419 // CHECK: gpu.sddmm_buffer_size async
420 %bufferSz3, %token12 = gpu.sddmm_buffer_size async [%token11] %dnmat, %dnmat, %spmat into f64
421 // CHECK: gpu.sddmm async
422 %token13 = gpu.sddmm async [%token12] %dnmat, %dnmat, %spmat, %mem2 : memref<?xf64> into f64
423 // CHECK: gpu.destroy_dn_tensor async
424 %token14 = gpu.destroy_dn_tensor async [%token13] %dnmat
425 // CHECK: gpu.destroy_sp_mat async
426 %token15 = gpu.destroy_sp_mat async [%token14] %spmat
427 // CHECK: gpu.destroy_dn_tensor async
428 %token16 = gpu.destroy_dn_tensor async [%token15] %dnvec
429 // CHECK: gpu.wait
430 gpu.wait [%token16]
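// gpu.module definitions can carry a list of target attributes (#nvvm.target, #rocdl.target<chip = "gfx90a">)
// naming the architectures to compile for, plus an optional offloading handler such as #gpu.select_object.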
436 gpu.module @module {
437 "gpu.func"() ({
438 gpu.return
443 gpu.module @module_with_one_target [#nvvm.target] {
444 gpu.func @kernel(%arg0 : f32) kernel {
445 gpu.return
449 gpu.module @module_with_two_target [#nvvm.target, #rocdl.target<chip = "gfx90a">] {
450 gpu.func @kernel(%arg0 : f32) kernel {
451 gpu.return
455 gpu.module @module_with_offload_handler <#gpu.select_object<0>> [#nvvm.target] {
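// #gpu.object attributes inside gpu.binary can carry a kernels = #gpu.kernel_table<[...]> of #gpu.kernel_metadata
// entries (kernel name, function type, optional arg_attrs and metadata dictionary); the CHECK on @kernel_attrs_2
// shows the table printing sorted by kernel name.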
459 gpu.binary @kernel_attrs_1 [
460 #gpu.object<#rocdl.target<chip = "gfx900">,
461 kernels = #gpu.kernel_table<[
462 #gpu.kernel_metadata<"kernel0", (i32, f32) -> (), metadata = {sgpr_count = 255}>,
463 #gpu.kernel_metadata<"kernel1", (i32) -> (), arg_attrs = [{llvm.read_only}]>
469 // CHECK-LABEL: gpu.binary @kernel_attrs_2
470 gpu.binary @kernel_attrs_2 [
471 // CHECK: [#gpu.kernel_metadata<"a_kernel", () -> ()>, #gpu.kernel_metadata<"m_kernel", () -> ()>, #gpu.kernel_metadata<"z_kernel", () -> ()>]
472 #gpu.object<#rocdl.target<chip = "gfx900">,
473 kernels = #gpu.kernel_table<[
474 #gpu.kernel_metadata<"z_kernel", () -> ()>,
475 #gpu.kernel_metadata<"m_kernel", () -> ()>,
476 #gpu.kernel_metadata<"a_kernel", () -> ()>
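// gpu.warp_execute_on_lane_0 runs its region on lane 0 of a warp of the given size (32 here); values crossing the
// region boundary are distributed between per-lane types outside and warp-wide vector types inside (e.g. a
// vector<4xi32> result outside versus the vector<128xi32> yielded inside), and the region terminates with gpu.yield.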
483 // CHECK-NEXT: gpu.warp_execute_on_lane_0(%{{.*}})[32] {
484 gpu.warp_execute_on_lane_0(%laneid)[32] {
493 // CHECK: gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1x4xi32>)
494 %2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x4xi32>) {
496 // CHECK: gpu.yield %{{.+}} : vector<4x32xi32>
497 gpu.yield %0 : vector<4x32xi32>
504 // CHECK-NEXT: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%{{.*}} : vector<4xi32>) -> (vector<4xi32>) {
505 %2 = gpu.warp_execute_on_lane_0(%laneid)[32]
510 // CHECK: gpu.yield %{{.*}} : vector<128xi32>
511 gpu.yield %1 : vector<128xi32>