// RUN: mlir-opt -allow-unregistered-dialect %s | FileCheck %s
// Verify the printed output can be parsed.
// RUN: mlir-opt -allow-unregistered-dialect %s | mlir-opt -allow-unregistered-dialect | FileCheck %s
// Verify the generic form can be parsed.
// RUN: mlir-opt -allow-unregistered-dialect -mlir-print-op-generic %s | mlir-opt -allow-unregistered-dialect | FileCheck %s

module attributes {gpu.container_module} {

  // CHECK-LABEL:func @no_args(%{{.*}}: index)
  func.func @no_args(%sz : index) {
    // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
    gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
               threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) {
      // CHECK: gpu.terminator
      gpu.terminator
    }
    return
  }

  // CHECK-LABEL:func @args(%{{.*}}: index, %{{.*}}: index, %{{.*}}: f32, %{{.*}}: memref<?xf32, 1>) {
  func.func @args(%blk : index, %thrd : index, %float : f32, %data : memref<?xf32,1>) {
    // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
    gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
               threads(%tx, %ty, %tz) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd) {
      "use"(%float) : (f32) -> ()
      "use"(%data) : (memref<?xf32,1>) -> ()
      // CHECK: gpu.terminator
      gpu.terminator
    }
    return
  }

  // CHECK-LABEL:func @launch_async(%{{.*}}: index, %{{.*}}: index) {
  func.func @launch_async(%blk : index, %thrd : index) {
    // CHECK: gpu.launch async [%{{.+}}] blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
    %t = gpu.wait async
    %name = gpu.launch async [%t] blocks(%arg0, %arg1, %arg2) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
               threads(%arg3, %arg4, %arg5) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd) {
      gpu.terminator
    }
    return
  }

  // CHECK-LABEL:func @launch_async_no_deps(%{{.*}}: index, %{{.*}}: index) {
  func.func @launch_async_no_deps(%blk : index, %thrd : index) {
    // CHECK: %{{.*}} = gpu.launch async blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
    %t0 = gpu.launch async blocks(%arg0, %arg1, %arg2) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
               threads(%arg3, %arg4, %arg5) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd) {
      gpu.terminator
    }
    // CHECK: gpu.launch async blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
    %t1 = gpu.launch async [] blocks(%arg0, %arg1, %arg2) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
               threads(%arg3, %arg4, %arg5) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd) {
      gpu.terminator
    }
    return
  }

  gpu.module @kernels {
    gpu.func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32, 1>) kernel {
      %tIdX = gpu.thread_id x
      // CHECK: thread_id_x
      %tIdY = gpu.thread_id y
      // CHECK-NEXT: thread_id_y
      %tIdZ = gpu.thread_id z
      // CHECK-NEXT: thread_id_z

      %bDimX = gpu.block_dim x
      // CHECK-NEXT: block_dim_x
      %bDimY = gpu.block_dim y
      // CHECK-NEXT: block_dim_y
      %bDimZ = gpu.block_dim z
      // CHECK-NEXT: block_dim_z

      %bIdX = gpu.block_id x
      // CHECK-NEXT: block_id_x
      %bIdY = gpu.block_id y
      // CHECK-NEXT: block_id_y
      %bIdZ = gpu.block_id z
      // CHECK-NEXT: block_id_z

      %gDimX = gpu.grid_dim x
      // CHECK-NEXT: grid_dim_x
      %gDimY = gpu.grid_dim y
      // CHECK-NEXT: grid_dim_y
      %gDimZ = gpu.grid_dim z
      // CHECK-NEXT: grid_dim_z

      %gIdX = gpu.global_id x
      // CHECK-NEXT: global_id_x
      %gIdY = gpu.global_id y
      // CHECK-NEXT: global_id_y
      %gIdZ = gpu.global_id z
      // CHECK-NEXT: global_id_z

      %sgId = gpu.subgroup_id : index
      %numSg = gpu.num_subgroups : index
      %SgSi = gpu.subgroup_size : index

      %one = arith.constant 1.0 : f32

      %vec = vector.broadcast %arg0 : f32 to vector<4xf32>

      // CHECK: %{{.*}} = gpu.all_reduce add %{{.*}} {
      // CHECK-NEXT: } : (f32) -> f32
      %sum = gpu.all_reduce add %one {} : (f32) -> (f32)

      // CHECK: %{{.*}} = gpu.all_reduce add %{{.*}} uniform {
      // CHECK-NEXT: } : (f32) -> f32
      %sum1 = gpu.all_reduce add %one uniform {} : (f32) -> f32

      // CHECK: %{{.*}} = gpu.all_reduce %{{.*}} {
      // CHECK-NEXT: ^{{.*}}(%{{.*}}: f32, %{{.*}}: f32):
      // CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
      // CHECK-NEXT: gpu.yield %{{.*}} : f32
      // CHECK-NEXT: } : (f32) -> f32
      %sum2 = gpu.all_reduce %one {
      ^bb(%lhs : f32, %rhs : f32):
        %tmp = arith.addf %lhs, %rhs : f32
        gpu.yield %tmp : f32
      } : (f32) -> (f32)

      // CHECK: %{{.*}} = gpu.subgroup_reduce add %{{.*}} : (f32) -> f32
      %sum_subgroup = gpu.subgroup_reduce add %one : (f32) -> f32

      // CHECK: %{{.*}} = gpu.subgroup_reduce add %{{.*}} uniform : (f32) -> f32
      %sum_subgroup1 = gpu.subgroup_reduce add %one uniform : (f32) -> f32

      // CHECK: %{{.*}} = gpu.subgroup_reduce add %{{.*}} : (vector<4xf32>) -> vector<4xf32>
      %sum_subgroup2 = gpu.subgroup_reduce add %vec : (vector<4xf32>) -> vector<4xf32>

      %width = arith.constant 7 : i32
      %offset = arith.constant 3 : i32
      // CHECK: gpu.shuffle xor %{{.*}}, %{{.*}}, %{{.*}} : f32
      %shfl, %pred = gpu.shuffle xor %arg0, %offset, %width : f32
      // CHECK: gpu.shuffle up %{{.*}}, %{{.*}}, %{{.*}} : f32
      %shfl1, %pred1 = gpu.shuffle up %arg0, %offset, %width : f32
      // CHECK: gpu.shuffle down %{{.*}}, %{{.*}}, %{{.*}} : f32
      %shfl2, %pred2 = gpu.shuffle down %arg0, %offset, %width : f32
      // CHECK: gpu.shuffle idx %{{.*}}, %{{.*}}, %{{.*}} : f32
      %shfl3, %pred3 = gpu.shuffle idx %arg0, %offset, %width : f32

      "gpu.barrier"() : () -> ()

      "some_op"(%bIdX, %tIdX) : (index, index) -> ()
      %42 = memref.load %arg1[%bIdX] : memref<?xf32, 1>
      gpu.return
    }

    gpu.func @kernel_2() kernel {
      gpu.return
    }
  }

  gpu.binary @binary_1 [#gpu.object<#nvvm.target, "">]

  gpu.binary @binary_2 <#gpu.select_object<#nvvm.target<chip = "sm_90">>> [#gpu.object<#nvvm.target, "">, #gpu.object<#nvvm.target<chip = "sm_90">, "">]

  gpu.binary @binary_3 <#gpu.select_object<1>> [#gpu.object<#nvvm.target, "">, #gpu.object<#nvvm.target<chip = "sm_90">, "">]

  gpu.binary @binary_4 [#gpu.object<#nvvm.target, bin = "">,
                        #gpu.object<#nvvm.target, assembly = "">,
                        #gpu.object<#nvvm.target, offload = "">,
                        #gpu.object<#nvvm.target, properties = { O = 3 : i32 }, offload = "">
                       ]

  // Check that fatbin gets elided as it's the default format.
  // CHECK: gpu.binary @binary_5 [#gpu.object<#nvvm.target, properties = {O = 3 : i32}, "">]
  gpu.binary @binary_5 [#gpu.object<#nvvm.target, properties = {O = 3 : i32}, fatbin = "">]

  func.func private @two_value_generator() -> (f32, memref<?xf32, 1>)

  func.func @foo() {
    %0 = "op"() : () -> (f32)
    %1 = "op"() : () -> (memref<?xf32, 1>)
    // CHECK: %{{.*}} = arith.constant 8
    %cst = arith.constant 8 : index
    %cstI64 = arith.constant 8 : i64
    %c0 = arith.constant 0 : i32
    %t0 = gpu.wait async
    %lowStream = llvm.mlir.zero : !llvm.ptr

    // CHECK: gpu.launch_func @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
    gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) args(%0 : f32, %1 : memref<?xf32, 1>)

    // CHECK: gpu.launch_func @kernels::@kernel_1 clusters in (%{{.*}}, %{{.*}}, %{{.*}}) blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
    gpu.launch_func @kernels::@kernel_1 clusters in (%cst, %cst, %cst) blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) args(%0 : f32, %1 : memref<?xf32, 1>)

    gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) dynamic_shared_memory_size %c0 args(%0 : f32, %1 : memref<?xf32, 1>)

    // CHECK: gpu.launch_func @kernels::@kernel_2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})
    gpu.launch_func @kernels::@kernel_2 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst)

    // CHECK: %{{.*}} = gpu.launch_func async [%{{.*}}] @kernels::@kernel_2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})
    %t1 = gpu.launch_func async [%t0] @kernels::@kernel_2 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst)

    // CHECK: gpu.launch_func <%{{.*}} : !llvm.ptr> @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i64 args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
    gpu.launch_func <%lowStream : !llvm.ptr> @kernels::@kernel_1 blocks in (%cstI64, %cstI64, %cstI64) threads in (%cstI64, %cstI64, %cstI64) : i64 args(%0 : f32, %1 : memref<?xf32, 1>)

    // CHECK: gpu.launch_func @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i32 args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
    gpu.launch_func @kernels::@kernel_1 blocks in (%c0, %c0, %c0) threads in (%c0, %c0, %c0) : i32 args(%0 : f32, %1 : memref<?xf32, 1>)

    // CHECK: gpu.launch_func @binary_1::@kernel blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i32 args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
    gpu.launch_func @binary_1::@kernel blocks in (%c0, %c0, %c0) threads in (%c0, %c0, %c0) : i32 args(%0 : f32, %1 : memref<?xf32, 1>)

    // CHECK: %[[VALUES:.*]]:2 = call
    %values:2 = func.call @two_value_generator() : () -> (f32, memref<?xf32, 1>)
    // CHECK: gpu.launch_func @kernels::@kernel_1 {{.*}} args(%[[VALUES]]#0 : f32, %[[VALUES]]#1 : memref<?xf32, 1>)
    gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) args(%values#0 : f32, %values#1 : memref<?xf32, 1>)

    return
  }

  gpu.module @gpu_funcs {
    // CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32)
    // CHECK: workgroup
    // CHECK: private
    // CHECK: attributes
    gpu.func @kernel_1(%arg0: f32)
        workgroup(%arg1: memref<42xf32, 3>)
        private(%arg2: memref<2xf32, 5>, %arg3: memref<1xf32, 5>)
        kernel
        attributes {foo="bar"} {
      "use"(%arg1) : (memref<42xf32, 3>) -> ()
      "use"(%arg2) : (memref<2xf32, 5>) -> ()
      "use"(%arg3) : (memref<1xf32, 5>) -> ()
      gpu.return
    }

    // CHECK-LABEL: gpu.func @printf_test
    // CHECK: (%[[ARG0:.*]]: i32)
    // CHECK: gpu.printf "Value: %d", %[[ARG0]] : i32
    gpu.func @printf_test(%arg0 : i32) {
      gpu.printf "Value: %d", %arg0 : i32
      gpu.return
    }

    // CHECK-LABEL: gpu.func @printf_empty
    // CHECK: gpu.printf "]"
    // CHECK: scf.if
    // CHECK: gpu.printf ", "
    gpu.func @printf_empty(%arg0 : i32) {
      gpu.printf "]"
      %1 = arith.cmpi slt, %arg0, %arg0 : i32
      scf.if %1 {
        gpu.printf ", "
      }
      gpu.return
    }

    // CHECK-LABEL: gpu.func @no_attribution
    // CHECK: {
    gpu.func @no_attribution(%arg0: f32) {
      gpu.return
    }

    // CHECK-LABEL: @no_attribution_attrs
    // CHECK: attributes
    // CHECK: {
    gpu.func @no_attribution_attrs(%arg0: f32) attributes {foo="bar"} {
      gpu.return
    }

    // CHECK-LABEL: @workgroup_only
    // CHECK: workgroup({{.*}}: {{.*}})
    // CHECK: {
    gpu.func @workgroup_only() workgroup(%arg0: memref<42xf32, 3>) {
      gpu.return
    }
    // CHECK-LABEL: @private_only
    // CHECK: private({{.*}}: {{.*}})
    // CHECK: {
    gpu.func @private_only() private(%arg0: memref<2xf32, 5>) {
      gpu.return
    }

    // CHECK-LABEL: @empty_attribution
    // CHECK: {
    gpu.func @empty_attribution(%arg0: f32) workgroup() private() {
      gpu.return
    }
  }

  gpu.module @explicit_attributions {
    // CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32, {{.*}}: memref<?xf32>) workgroup({{.*}}: memref<5xf32, 3>) private({{.*}}: memref<5xf32, 5>)
    "gpu.func"() ({
    ^bb0(%arg0: f32, %arg1: memref<?xf32>, %arg2: memref<5xf32, 3>, %arg3: memref<5xf32, 5>):
      "gpu.return"() : () -> ()
    } ) {function_type = (f32, memref<?xf32>) -> (), gpu.kernel, sym_name = "kernel_1", workgroup_attributions = 1: i64} : () -> ()
  }

  func.func @alloc() {
    // CHECK-LABEL: func @alloc()

    // CHECK: %[[m0:.*]] = gpu.alloc () : memref<13xf32, 1>
    %m0 = gpu.alloc () : memref<13xf32, 1>
    // CHECK: gpu.dealloc %[[m0]] : memref<13xf32, 1>
    gpu.dealloc %m0 : memref<13xf32, 1>

    %t0 = gpu.wait async
    // CHECK: %[[m1:.*]], %[[t1:.*]] = gpu.alloc async [{{.*}}] () : memref<13xf32, 1>
    %m1, %t1 = gpu.alloc async [%t0] () : memref<13xf32, 1>
    // CHECK: gpu.dealloc async [%[[t1]]] %[[m1]] : memref<13xf32, 1>
    %t2 = gpu.dealloc async [%t1] %m1 : memref<13xf32, 1>

    // CHECK: %[[m2:.*]] = gpu.alloc host_shared () : memref<13xf32, 1>
    %m2 = gpu.alloc host_shared () : memref<13xf32, 1>
    // CHECK: gpu.dealloc %[[m2]] : memref<13xf32, 1>
    gpu.dealloc %m2 : memref<13xf32, 1>

    return
  }

  func.func @async_token(%arg0 : !gpu.async.token) -> !gpu.async.token {
    // CHECK-LABEL: func @async_token({{.*}}: !gpu.async.token)
    // CHECK: return {{.*}} : !gpu.async.token
    return %arg0 : !gpu.async.token
  }

  func.func @async_wait() {
    // CHECK-LABEL: func @async_wait
    // CHECK: %[[t0:.*]] = gpu.wait async
    %0 = gpu.wait async
    // CHECK: %[[t1:.*]] = gpu.wait async [%[[t0]]]
    %1 = gpu.wait async [%0]
    // CHECK: %{{.*}} = gpu.wait async [%[[t0]], %[[t1]]]
    %2 = gpu.wait async [%0, %1]
    // CHECK: gpu.wait [%[[t0]], %[[t1]]]
    // CHECK-NOT: async
    gpu.wait [%0, %1]
    // CHECK: gpu.wait
    // CHECK-NOT: async
    gpu.wait // Valid, but a no-op.
    return
  }

  func.func @memcpy(%dst : memref<3x7xf32>, %src : memref<3x7xf32, 1>) {
    // CHECK-LABEL: func @memcpy
    // CHECK: gpu.memcpy {{.*}}, {{.*}} : memref<3x7xf32>, memref<3x7xf32, 1>
    gpu.memcpy %dst, %src : memref<3x7xf32>, memref<3x7xf32, 1>
    // CHECK: %[[t0:.*]] = gpu.wait async
    %0 = gpu.wait async
    // CHECK: {{.*}} = gpu.memcpy async [%[[t0]]] {{.*}}, {{.*}} : memref<3x7xf32>, memref<3x7xf32, 1>
    %1 = gpu.memcpy async [%0] %dst, %src : memref<3x7xf32>, memref<3x7xf32, 1>
    return
  }

  func.func @memset(%dst : memref<3x7xf32>, %value : f32) {
    // CHECK-LABEL: func @memset
    // CHECK: gpu.memset {{.*}}, {{.*}} : memref<3x7xf32>, f32
    gpu.memset %dst, %value : memref<3x7xf32>, f32
    // CHECK: %[[t0:.*]] = gpu.wait async
    %0 = gpu.wait async
    // CHECK: {{.*}} = gpu.memset async [%[[t0]]] {{.*}}, {{.*}} : memref<3x7xf32>, f32
    %1 = gpu.memset async [%0] %dst, %value : memref<3x7xf32>, f32
    return
  }

  func.func @mmamatrix_valid_scalar_element_type(%src : memref<32x32xf16, affine_map<(d0, d1) -> (d0 * 64 + d1)>>){
    // CHECK-LABEL: func @mmamatrix_valid_scalar_element_type
    %wg = memref.alloca() {alignment = 32} : memref<32x32xf16, 3>
    // CHECK: %[[wg:.*]] = memref.alloca()
    %i = arith.constant 16 : index
    // CHECK: %[[i:.*]] = arith.constant 16 : index
    %cst = arith.constant 1.000000e+00 : f32
    // CHECK: %[[cst:.*]] = arith.constant 1.000000e+00 : f32
    %0 = gpu.subgroup_mma_load_matrix %wg[%i, %i] {leadDimension = 32 : index} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
    // CHECK: gpu.subgroup_mma_load_matrix %[[wg]][%[[i]], %[[i]]] {leadDimension = 32 : index} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
    %s = gpu.subgroup_mma_load_matrix %src[%i, %i] {leadDimension = 64 : index} : memref<32x32xf16, affine_map<(d0, d1) -> (d0 * 64 + d1)>> -> !gpu.mma_matrix<16x16xf16, "AOp">
    // CHECK: gpu.subgroup_mma_load_matrix %{{.*}}[%[[i]], %[[i]]] {leadDimension = 64 : index} : memref<32x32xf16, #{{.*}}> -> !gpu.mma_matrix<16x16xf16, "AOp">
    %1 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
    // CHECK: gpu.subgroup_mma_elementwise addf %{{.*}}, %{{.*}} : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp">
    %2 = gpu.subgroup_mma_elementwise addf %1, %1 : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp">
    // CHECK: gpu.subgroup_mma_elementwise maxf %{{.*}}, %{{.*}} : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp">
    %3 = gpu.subgroup_mma_elementwise maxf %2, %1 : (!gpu.mma_matrix<16x16xf32, "COp">, !gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp">
    return
  }

  // CHECK-LABEL: func @mmamatrix_valid_vector_element_type
  func.func @mmamatrix_valid_vector_element_type(%src : memref<32x4xvector<4xf32>>, %i : index) {
    // CHECK: gpu.subgroup_mma_load_matrix
    %s = gpu.subgroup_mma_load_matrix %src[%i, %i] {leadDimension = 4 : index} : memref<32x4xvector<4xf32>> -> !gpu.mma_matrix<16x16xf16, "COp">
    // CHECK: gpu.subgroup_mma_store_matrix
    gpu.subgroup_mma_store_matrix %s, %src[%i, %i] {leadDimension = 4 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<32x4xvector<4xf32>>
    return
  }

  // CHECK-LABEL: func @set_default_device
  func.func @set_default_device(%arg0: i32) {
    // CHECK: gpu.set_default_device
    gpu.set_default_device %arg0
    return
  }

  // CHECK-LABEL: func @sparse_ops
  func.func @sparse_ops(%arg0: index) {
    // CHECK: gpu.wait async
    %token0 = gpu.wait async
    // CHECK: gpu.alloc async
    %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
    // CHECK: gpu.alloc async
    %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
    // CHECK: gpu.create_coo async
    %spmat, %token4 = gpu.create_coo async [%token2] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
    // CHECK: gpu.create_csr async
    %spmat2, %token5 = gpu.create_csr async [%token4] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
    // CHECK: gpu.create_dn_tensor async
    %dnvec, %token6 = gpu.create_dn_tensor async [%token5] %mem2, %arg0 : index into memref<?xf64>
    // CHECK: gpu.spmv_buffer_size async
    %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %spmat, %dnvec, %dnvec into f64
    // CHECK: gpu.spmv async
    %token8 = gpu.spmv async [%token7] %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64> into f64
    // CHECK: gpu.create_dn_tensor async
    %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %mem2, %arg0, %arg0 : index, index into memref<?xf64>
    // CHECK: gpu.spmm_buffer_size async
    %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %spmat, %dnmat, %dnmat : index into f64
    // CHECK: gpu.spmm async
    %token11 = gpu.spmm async [%token10] %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64> into f64
    // CHECK: gpu.sddmm_buffer_size async
    %bufferSz3, %token12 = gpu.sddmm_buffer_size async [%token11] %dnmat, %dnmat, %spmat into f64
    // CHECK: gpu.sddmm async
    %token13 = gpu.sddmm async [%token12] %dnmat, %dnmat, %spmat, %mem2 : memref<?xf64> into f64
    // CHECK: gpu.destroy_dn_tensor async
    %token14 = gpu.destroy_dn_tensor async [%token13] %dnmat
    // CHECK: gpu.destroy_sp_mat async
    %token15 = gpu.destroy_sp_mat async [%token14] %spmat
    // CHECK: gpu.destroy_dn_tensor async
    %token16 = gpu.destroy_dn_tensor async [%token15] %dnvec
    // CHECK: gpu.wait
    gpu.wait [%token16]
    return
  }
}

// Just check that this doesn't crash.
gpu.module @module {
  "gpu.func"() ({
    gpu.return
  }) {function_type = () -> (), sym_name = "func"} : () -> ()
}

// Check that this doesn't crash.
gpu.module @module_with_one_target [#nvvm.target] {
  gpu.func @kernel(%arg0 : f32) kernel {
    gpu.return
  }
}

gpu.module @module_with_two_target [#nvvm.target, #rocdl.target<chip = "gfx90a">] {
  gpu.func @kernel(%arg0 : f32) kernel {
    gpu.return
  }
}

gpu.module @module_with_offload_handler <#gpu.select_object<0>> [#nvvm.target] {
}

// Test kernel attributes
gpu.binary @kernel_attrs_1 [
    #gpu.object<#rocdl.target<chip = "gfx900">,
      kernels = #gpu.kernel_table<[
        #gpu.kernel_metadata<"kernel0", (i32, f32) -> (), metadata = {sgpr_count = 255}>,
        #gpu.kernel_metadata<"kernel1", (i32) -> (), arg_attrs = [{llvm.read_only}]>
      ]>,
      bin = "BLOB">
  ]

// Verify the kernels are sorted
// CHECK-LABEL: gpu.binary @kernel_attrs_2
gpu.binary @kernel_attrs_2 [
    // CHECK: [#gpu.kernel_metadata<"a_kernel", () -> ()>, #gpu.kernel_metadata<"m_kernel", () -> ()>, #gpu.kernel_metadata<"z_kernel", () -> ()>]
    #gpu.object<#rocdl.target<chip = "gfx900">,
      kernels = #gpu.kernel_table<[
        #gpu.kernel_metadata<"z_kernel", () -> ()>,
        #gpu.kernel_metadata<"m_kernel", () -> ()>,
        #gpu.kernel_metadata<"a_kernel", () -> ()>
      ]>,
      bin = "BLOB">
  ]

// CHECK-LABEL: func @warp_execute_on_lane_0(
func.func @warp_execute_on_lane_0(%laneid: index) {
// CHECK-NEXT: gpu.warp_execute_on_lane_0(%{{.*}})[32] {
  gpu.warp_execute_on_lane_0(%laneid)[32] {
// CHECK-NEXT: }
  }
// CHECK-NEXT: return
  return
}

// CHECK-LABEL: func.func @warp_execute_on_lane_0_2d
func.func @warp_execute_on_lane_0_2d(%laneid: index) {
  // CHECK: gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1x4xi32>)
  %2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x4xi32>) {
    %0 = arith.constant dense<2>: vector<4x32xi32>
    // CHECK: gpu.yield %{{.+}} : vector<4x32xi32>
    gpu.yield %0 : vector<4x32xi32>
  }
  return
}

// CHECK-LABEL: func @warp_operand_result(
func.func @warp_operand_result(%laneid: index, %v0 : vector<4xi32>) -> (vector<4xi32>) {
// CHECK-NEXT: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%{{.*}} : vector<4xi32>) -> (vector<4xi32>) {
  %2 = gpu.warp_execute_on_lane_0(%laneid)[32]
      args(%v0 : vector<4xi32>) -> (vector<4xi32>) {
    ^bb0(%arg0 : vector<128xi32>) :
    %0 = arith.constant dense<2>: vector<128xi32>
    %1 = arith.addi %arg0, %0 : vector<128xi32>
// CHECK: gpu.yield %{{.*}} : vector<128xi32>
    gpu.yield %1 : vector<128xi32>
// CHECK-NEXT: }
  }
  return %2 : vector<4xi32>
}