// RUN: mlir-opt -allow-unregistered-dialect -gpu-launch-sink-index-computations -gpu-kernel-outlining -split-input-file -verify-diagnostics %s | FileCheck %s
// RUN: mlir-opt -allow-unregistered-dialect -gpu-launch-sink-index-computations -gpu-kernel-outlining=data-layout-str='#dlti.dl_spec<#dlti.dl_entry<index,32:i32>>' -split-input-file %s | FileCheck --check-prefix CHECK-DL %s

// CHECK: module attributes {gpu.container_module}

// CHECK-LABEL: func @launch()
func.func @launch() {
  // CHECK: %[[ARG0:.*]] = "op"() : () -> f32
  %0 = "op"() : () -> (f32)
  // CHECK: %[[ARG1:.*]] = "op"() : () -> memref<?xf32, 1>
  %1 = "op"() : () -> (memref<?xf32, 1>)
  // CHECK: %[[GDIMX:.*]] = arith.constant 8
  %gDimX = arith.constant 8 : index
  // CHECK: %[[GDIMY:.*]] = arith.constant 12
  %gDimY = arith.constant 12 : index
  // CHECK: %[[GDIMZ:.*]] = arith.constant 16
  %gDimZ = arith.constant 16 : index
  // CHECK: %[[BDIMX:.*]] = arith.constant 20
  %bDimX = arith.constant 20 : index
  // CHECK: %[[BDIMY:.*]] = arith.constant 24
  %bDimY = arith.constant 24 : index
  // CHECK: %[[BDIMZ:.*]] = arith.constant 28
  %bDimZ = arith.constant 28 : index

  // CHECK: gpu.launch_func @launch_kernel::@launch_kernel blocks in (%[[GDIMX]], %[[GDIMY]], %[[GDIMZ]]) threads in (%[[BDIMX]], %[[BDIMY]], %[[BDIMZ]]) args(%[[ARG0]] : f32, %[[ARG1]] : memref<?xf32, 1>)
  // CHECK-NOT: gpu.launch blocks
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY,
                                       %grid_z = %gDimZ)
             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY,
                                        %block_z = %bDimZ) {
    "use"(%0): (f32) -> ()
    "some_op"(%bx, %block_x) : (index, index) -> ()
    %42 = memref.load %1[%tx] : memref<?xf32, 1>
    gpu.terminator
  }
  return
}

// CHECK-DL-LABEL: gpu.module @launch_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}
// CHECK-LABEL: gpu.module @launch_kernel
// CHECK-NEXT: gpu.func @launch_kernel
// CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref<?xf32, 1>)
// CHECK-SAME: known_block_size = array<i32: 20, 24, 28>
// CHECK-SAME: known_grid_size = array<i32: 8, 12, 16>
// CHECK-NEXT: %[[BID:.*]] = gpu.block_id x
// CHECK-NEXT: = gpu.block_id y
// CHECK-NEXT: = gpu.block_id z
// CHECK-NEXT: %[[TID:.*]] = gpu.thread_id x
// CHECK-NEXT: = gpu.thread_id y
// CHECK-NEXT: = gpu.thread_id z
// CHECK-NEXT: = gpu.grid_dim x
// CHECK-NEXT: = gpu.grid_dim y
// CHECK-NEXT: = gpu.grid_dim z
// CHECK-NEXT: %[[BDIM:.*]] = gpu.block_dim x
// CHECK-NEXT: = gpu.block_dim y
// CHECK-NEXT: = gpu.block_dim z
// CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
// CHECK-NEXT: "some_op"(%[[BID]], %[[BDIM]]) : (index, index) -> ()
// CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>

// -----

// Verify that we can outline a CFG
// CHECK-LABEL: gpu.func @launchCFG_kernel(
// CHECK: cf.br
// CHECK: gpu.return
func.func @launchCFG() {
  %0 = "op"() : () -> (f32)
  %1 = "op"() : () -> (memref<?xf32, 1>)
  %gDimX = arith.constant 8 : index
  %gDimY = arith.constant 12 : index
  %gDimZ = arith.constant 16 : index
  %bDimX = arith.constant 20 : index
  %bDimY = arith.constant 24 : index
  %bDimZ = arith.constant 28 : index

  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY,
                                       %grid_z = %gDimZ)
             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY,
                                        %block_z = %bDimZ) {
    "use"(%0): (f32) -> ()
    cf.br ^bb1
  ^bb1:
    "some_op"(%bx, %block_x) : (index, index) -> ()
    %42 = memref.load %1[%tx] : memref<?xf32, 1>
    gpu.terminator
  }
  return
}


// -----

// This test checks that the gpu-kernel-outlining pass can handle a gpu.launch inside an llvm.func.
// CHECK-LABEL: @launch_from_llvm_func
llvm.func @launch_from_llvm_func() {
  // CHECK: %[[ARG0:.*]] = "op"() : () -> f32
  %0 = "op"() : () -> (f32)
  // CHECK: %[[ARG1:.*]] = "op"() : () -> memref<?xf32, 1>
  %1 = "op"() : () -> (memref<?xf32, 1>)

  // CHECK: %[[DIM:.*]] = arith.constant 1
  %dim = arith.constant 1 : index

  // CHECK: gpu.launch_func @launch_from_llvm_func_kernel::@launch_from_llvm_func_kernel
  // CHECK-SAME: (%[[DIM]], %[[DIM]], %[[DIM]])
  // CHECK-SAME: (%[[DIM]], %[[DIM]], %[[DIM]]) args(%[[ARG0]] : f32, %[[ARG1]] : memref<?xf32, 1>)
  // CHECK-NEXT: llvm.return

  // CHECK: gpu.func {{.*}} kernel attributes
  // CHECK-SAME: known_block_size = array<i32: 1, 1, 1>
  // CHECK-SAME: known_grid_size = array<i32: 1, 1, 1>
  // CHECK: gpu.return
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %dim, %grid_y = %dim,
                                       %grid_z = %dim)
             threads(%tx, %ty, %tz) in (%block_x = %dim, %block_y = %dim,
                                        %block_z = %dim) {
    "use"(%0): (f32) -> ()
    "some_op"(%bx, %block_x) : (index, index) -> ()
    %2 = memref.load %1[%tx] : memref<?xf32, 1>
    gpu.terminator
  }
  llvm.return
}

// CHECK-DL-LABEL: gpu.module @launch_from_llvm_func_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}

// -----

// CHECK: module attributes {gpu.container_module}
// CHECK-LABEL: @multiple_launches
func.func @multiple_launches() {
  // CHECK: %[[CST:.*]] = arith.constant 8 : index
  %cst = arith.constant 8 : index
  // CHECK: gpu.launch_func @multiple_launches_kernel::@multiple_launches_kernel blocks in (%[[CST]], %[[CST]], %[[CST]]) threads in (%[[CST]], %[[CST]], %[[CST]])
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
                                       %grid_z = %cst)
             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
                                        %block_z = %cst) {
    gpu.terminator
  }
  // CHECK: gpu.launch_func @multiple_launches_kernel_0::@multiple_launches_kernel blocks in (%[[CST]], %[[CST]], %[[CST]]) threads in (%[[CST]], %[[CST]], %[[CST]])
  gpu.launch blocks(%bx2, %by2, %bz2) in (%grid_x2 = %cst, %grid_y2 = %cst,
                                          %grid_z2 = %cst)
             threads(%tx2, %ty2, %tz2) in (%block_x2 = %cst, %block_y2 = %cst,
                                           %block_z2 = %cst) {
    gpu.terminator
  }

  // With async and async deps.
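  // The async token produced by gpu.wait is expected to be carried over as a
  // dependency of the outlined launch, and the launch's own async result is kept.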
  // CHECK: %[[TOKEN:.*]] = gpu.wait async
  // CHECK: gpu.launch_func async [%[[TOKEN]]] @multiple_launches_kernel_1::@multiple_launches_kernel blocks in (%[[CST]], %[[CST]], %[[CST]]) threads in (%[[CST]], %[[CST]], %[[CST]])
  %t = gpu.wait async
  %u = gpu.launch async [%t] blocks(%bx2, %by2, %bz2) in (%grid_x2 = %cst, %grid_y2 = %cst,
                                                          %grid_z2 = %cst)
                             threads(%tx2, %ty2, %tz2) in (%block_x2 = %cst, %block_y2 = %cst,
                                                           %block_z2 = %cst) {
    gpu.terminator
  }

  // CHECK: gpu.launch_func async @multiple_launches_kernel_2::@multiple_launches_kernel blocks in (%[[CST]], %[[CST]], %[[CST]]) threads in (%[[CST]], %[[CST]], %[[CST]])
  %v = gpu.launch async blocks(%bx2, %by2, %bz2) in (%grid_x2 = %cst, %grid_y2 = %cst,
                                                     %grid_z2 = %cst)
                        threads(%tx2, %ty2, %tz2) in (%block_x2 = %cst, %block_y2 = %cst,
                                                      %block_z2 = %cst) {
    gpu.terminator
  }

  return
}

// CHECK-DL-LABEL: gpu.module @multiple_launches_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}
// CHECK-DL-LABEL: gpu.module @multiple_launches_kernel_0 attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}

// CHECK: gpu.module @multiple_launches_kernel
// CHECK: func @multiple_launches_kernel
// CHECK: module @multiple_launches_kernel_0
// CHECK: func @multiple_launches_kernel

// -----

// CHECK-LABEL: @extra_constants_not_inlined
func.func @extra_constants_not_inlined(%arg0: memref<?xf32>) {
  // CHECK: %[[CST:.*]] = arith.constant 8 : index
  %cst = arith.constant 8 : index
  %cst2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %cst3 = "secret_constant"() : () -> index
  // CHECK: gpu.launch_func @extra_constants_not_inlined_kernel::@extra_constants_not_inlined_kernel blocks in (%[[CST]], %[[CST]], %[[CST]]) threads in (%[[CST]], %[[CST]], %[[CST]]) args({{.*}} : memref<?xf32>, {{.*}} : index)
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
                                       %grid_z = %cst)
             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
                                        %block_z = %cst) {
    "use"(%cst2, %arg0, %cst3) : (index, memref<?xf32>, index) -> ()
    gpu.terminator
  }
  return
}

// CHECK-DL-LABEL: gpu.module @extra_constants_not_inlined_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}

// CHECK-LABEL: func @extra_constants_not_inlined_kernel(%{{.*}}: memref<?xf32>, %{{.*}}: index)
// CHECK: arith.constant 2

// -----

// CHECK-LABEL: @extra_constants
// CHECK-SAME: %[[ARG0:.*]]: memref<?xf32>
func.func @extra_constants(%arg0: memref<?xf32>) {
  // CHECK: %[[CST:.*]] = arith.constant 8 : index
  %cst = arith.constant 8 : index
  %cst2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  %cst3 = memref.dim %arg0, %c0 : memref<?xf32>
  // CHECK: gpu.launch_func @extra_constants_kernel::@extra_constants_kernel blocks in (%[[CST]], %[[CST]], %[[CST]]) threads in (%[[CST]], %[[CST]], %[[CST]]) args(%[[ARG0]] : memref<?xf32>)
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
                                       %grid_z = %cst)
             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
                                        %block_z = %cst) {
    "use"(%cst2, %arg0, %cst3) : (index, memref<?xf32>, index) -> ()
    gpu.terminator
  }
  return
}

// CHECK-DL-LABEL: gpu.module @extra_constants_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}

// CHECK-LABEL: func @extra_constants_kernel(
// CHECK-SAME: %[[KARG0:.*]]: memref<?xf32>
// CHECK: arith.constant 2
// CHECK: arith.constant 0
// CHECK: memref.dim %[[KARG0]]

// -----

// CHECK-LABEL: @extra_constants_noarg
// CHECK-SAME: %[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: memref<?xf32>
func.func @extra_constants_noarg(%arg0: memref<?xf32>, %arg1: memref<?xf32>) {
  // CHECK: %[[CST:.*]] = arith.constant 8 : index
  %cst = arith.constant 8 : index
  %cst2 = arith.constant 2 : index
  %c0 = arith.constant 0 : index
  // CHECK: memref.dim %[[ARG1]]
  %cst3 = memref.dim %arg1, %c0 : memref<?xf32>
  // CHECK: gpu.launch_func @extra_constants_noarg_kernel::@extra_constants_noarg_kernel blocks in (%[[CST]], %[[CST]], %[[CST]]) threads in (%[[CST]], %[[CST]], %[[CST]]) args(%[[ARG0]] : memref<?xf32>, {{.*}} : index)
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
                                       %grid_z = %cst)
             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
                                        %block_z = %cst) {
    "use"(%cst2, %arg0, %cst3) : (index, memref<?xf32>, index) -> ()
    gpu.terminator
  }
  return
}

// CHECK-DL-LABEL: gpu.module @extra_constants_noarg_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}

// CHECK-LABEL: func @extra_constants_noarg_kernel(
// CHECK-SAME: %[[KARG0:.*]]: memref<?xf32>, %[[KARG1:.*]]: index
// CHECK: %[[KCST:.*]] = arith.constant 2
// CHECK: "use"(%[[KCST]], %[[KARG0]], %[[KARG1]])

// -----

// CHECK-LABEL: @multiple_uses
func.func @multiple_uses(%arg0 : memref<?xf32>) {
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  // CHECK: gpu.func {{.*}} {
  // CHECK: %[[C2:.*]] = arith.constant 2 : index
  // CHECK: "use1"(%[[C2]], %[[C2]])
  // CHECK: "use2"(%[[C2]])
  // CHECK: gpu.return
  // CHECK: }
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1,
                                       %grid_z = %c1)
             threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1,
                                        %block_z = %c1) {
    "use1"(%c2, %c2) : (index, index) -> ()
    "use2"(%c2) : (index) -> ()
    gpu.terminator
  }
  return
}

// CHECK-DL-LABEL: gpu.module @multiple_uses_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}

// -----

// CHECK-LABEL: @multiple_uses2
func.func @multiple_uses2(%arg0 : memref<*xf32>) {
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %d = memref.dim %arg0, %c2 : memref<*xf32>
  // CHECK: gpu.func {{.*}} {
  // CHECK: %[[C2:.*]] = arith.constant 2 : index
  // CHECK: %[[D:.*]] = memref.dim %[[ARG:.*]], %[[C2]]
  // CHECK: "use1"(%[[D]])
  // CHECK: "use2"(%[[C2]], %[[C2]])
  // CHECK: "use3"(%[[ARG]])
  // CHECK: gpu.return
  // CHECK: }
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1,
                                       %grid_z = %c1)
             threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1,
                                        %block_z = %c1) {
    "use1"(%d) : (index) -> ()
    "use2"(%c2, %c2) : (index, index) -> ()
    "use3"(%arg0) : (memref<*xf32>) -> ()
    gpu.terminator
  }
  return
}

// CHECK-DL-LABEL: gpu.module @multiple_uses2_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}

// -----

llvm.mlir.global internal @global(42 : i64) : i64

// CHECK-LABEL: @function_call
func.func @function_call(%arg0 : memref<?xf32>) {
  %cst = arith.constant 8 : index
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
                                       %grid_z = %cst)
             threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
                                        %block_z = %cst) {
    func.call @device_function() : () -> ()
    func.call @device_function() : () -> ()
    %0 = llvm.mlir.addressof @global : !llvm.ptr
    gpu.terminator
  }
  return
}

func.func @device_function() {
  call @recursive_device_function() : () -> ()
  return
}

func.func @recursive_device_function() {
  call @recursive_device_function() : () -> ()
  return
}

// CHECK-DL-LABEL: gpu.module @function_call_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}

// CHECK: gpu.module @function_call_kernel {
// CHECK: gpu.func @function_call_kernel()
// CHECK: call @device_function() : () -> ()
// CHECK: call @device_function() : () -> ()
// CHECK: llvm.mlir.addressof @global : !llvm.ptr
// CHECK: gpu.return
//
// CHECK: llvm.mlir.global internal @global(42 : i64) {addr_space = 0 : i32} : i64
//
// CHECK: func @device_function()
// CHECK: func @recursive_device_function()
// CHECK-NOT: func @device_function

// -----

// CHECK-LABEL: @non_constant_launches
func.func @non_constant_launches(%arg0 : index) {
  // CHECK-NOT: known_block_size
  // CHECK-NOT: known_grid_size
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %arg0, %grid_y = %arg0,
                                       %grid_z = %arg0)
             threads(%tx, %ty, %tz) in (%block_x = %arg0, %block_y = %arg0,
                                        %block_z = %arg0) {
    gpu.terminator
  }
  return
}

// CHECK-DL-LABEL: gpu.module @non_constant_launches_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}

// CHECK: module attributes {gpu.container_module}

// -----

// This test checks memory attributions for gpu.launch, using both workgroup and private attributions.
// CHECK-LABEL: func @launch_memory_attributions_0()
func.func @launch_memory_attributions_0() {
  %1 = "op"() : () -> (memref<?xf32, 1>)
  %128 = arith.constant 128 : index

  // CHECK: gpu.launch_func @launch_memory_attributions_0_kernel::@launch_memory_attributions_0_kernel
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %128, %grid_y = %128,
                                       %grid_z = %128)
             threads(%tx, %ty, %tz) in (%block_x = %128, %block_y = %128,
                                        %block_z = %128)
             workgroup(%shared: memref<42xf32, 3>)
             private(%priv0: memref<2xf32, 5>, %priv1: memref<1xf32, 5>) {
    "some_op"(%bx, %block_x) : (index, index) -> ()
    %42 = memref.load %1[%tx] : memref<?xf32, 1>
    %43 = memref.load %shared[%tx] : memref<42xf32, 3>
    %44 = memref.load %priv1[%tx] : memref<1xf32, 5>
    gpu.terminator
  }
  return
}

// CHECK-DL-LABEL: gpu.module @launch_memory_attributions_0_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}

// CHECK-LABEL: gpu.module @launch_memory_attributions_0_kernel
// CHECK-NEXT: gpu.func @launch_memory_attributions_0_kernel
// CHECK-SAME: workgroup(%[[KERNEL_ARG1:.*]] : memref<42xf32, 3>)
// CHECK-SAME: private(%[[KERNEL_ARG2:.*]] : memref<2xf32, 5>, %[[KERNEL_ARG3:.*]] : memref<1xf32, 5>)
// CHECK: %[[TID:.*]] = gpu.thread_id x
// CHECK: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<42xf32, 3>
// CHECK-NEXT: = memref.load %[[KERNEL_ARG3]][%[[TID]]] : memref<1xf32, 5>

// -----

// This test checks correctness of private attributions in the absence of workgroup attributions.
// CHECK-LABEL: @launch_memory_attributions_1
func.func @launch_memory_attributions_1(%arg0 : memref<*xf32>) {
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %d = memref.dim %arg0, %c2 : memref<*xf32>
  // CHECK: gpu.func {{.*}} private(%[[KERNEL_ARG:.*]] : memref<3xf32, 5>) {{.*}} {
  // CHECK: %[[C2:.*]] = arith.constant 2 : index
  // CHECK: = memref.load %[[KERNEL_ARG]][%[[C2]]] : memref<3xf32, 5>
  // CHECK: gpu.return
  // CHECK: }
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1,
                                       %grid_z = %c1)
             threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1,
                                        %block_z = %c1)
             private(%priv0: memref<3xf32, 5>) {
    %42 = memref.load %priv0[%c2] : memref<3xf32, 5>
    gpu.terminator
  }
  return
}

// CHECK-DL-LABEL: gpu.module @launch_memory_attributions_1_kernel attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i32>}

// -----
// CHECK: module attributes {gpu.container_module}

// CHECK-LABEL: func @launch_cluster()
func.func @launch_cluster() {
  // CHECK: %[[ARG0:.*]] = "op"() : () -> f32
  %0 = "op"() : () -> (f32)
  // CHECK: %[[ARG1:.*]] = "op"() : () -> memref<?xf32, 1>
  %1 = "op"() : () -> (memref<?xf32, 1>)
  // CHECK: %[[CDIMX:.*]] = arith.constant 1
  %cDimX = arith.constant 1 : index
  // CHECK: %[[CDIMY:.*]] = arith.constant 2
  %cDimY = arith.constant 2 : index
  // CHECK: %[[CDIMZ:.*]] = arith.constant 1
  %cDimZ = arith.constant 1 : index
  // CHECK: %[[GDIMX:.*]] = arith.constant 8
  %gDimX = arith.constant 8 : index
  // CHECK: %[[GDIMY:.*]] = arith.constant 12
  %gDimY = arith.constant 12 : index
  // CHECK: %[[GDIMZ:.*]] = arith.constant 16
  %gDimZ = arith.constant 16 : index
  // CHECK: %[[BDIMX:.*]] = arith.constant 20
  %bDimX = arith.constant 20 : index
  // CHECK: %[[BDIMY:.*]] = arith.constant 24
  %bDimY = arith.constant 24 : index
  // CHECK: %[[BDIMZ:.*]] = arith.constant 28
  %bDimZ = arith.constant 28 : index

  // CHECK: gpu.launch_func @launch_cluster_kernel::@launch_cluster_kernel clusters in (%[[CDIMX]], %[[CDIMY]], %[[CDIMZ]]) blocks in (%[[GDIMX]], %[[GDIMY]], %[[GDIMZ]]) threads in (%[[BDIMX]], %[[BDIMY]], %[[BDIMZ]]) args(%[[ARG0]] : f32, %[[ARG1]] : memref<?xf32, 1>)
  // CHECK-NOT: gpu.launch blocks
  gpu.launch clusters(%cx, %cy, %cz) in (%cluster_x = %cDimX, %cluster_y = %cDimY,
                                         %cluster_z = %cDimZ)
             blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY,
                                       %grid_z = %gDimZ)
             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY,
                                        %block_z = %bDimZ) {
    "use"(%0): (f32) -> ()
    "some_op"(%cx, %bx, %block_x) : (index, index, index) -> ()
    %42 = memref.load %1[%tx] : memref<?xf32, 1>
    gpu.terminator
  }
  return
}

// CHECK-LABEL: gpu.module @launch_cluster_kernel
// CHECK-NEXT: gpu.func @launch_cluster_kernel
// CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref<?xf32, 1>)
// CHECK-SAME: known_block_size = array<i32: 20, 24, 28>
// CHECK-SAME: known_grid_size = array<i32: 8, 12, 16>
// CHECK-NEXT: %[[BID:.*]] = gpu.block_id x
// CHECK-NEXT: = gpu.block_id y
// CHECK-NEXT: = gpu.block_id z
// CHECK-NEXT: %[[TID:.*]] = gpu.thread_id x
// CHECK-NEXT: = gpu.thread_id y
// CHECK-NEXT: = gpu.thread_id z
// CHECK-NEXT: = gpu.grid_dim x
// CHECK-NEXT: = gpu.grid_dim y
// CHECK-NEXT: = gpu.grid_dim z
// CHECK-NEXT: %[[BDIM:.*]] = gpu.block_dim x
// CHECK-NEXT: = gpu.block_dim y
// CHECK-NEXT: = gpu.block_dim z
// CHECK-NEXT: %[[CID:.*]] = gpu.cluster_id x
// CHECK-NEXT: = gpu.cluster_id y
// CHECK-NEXT: = gpu.cluster_id z
// CHECK-NEXT: %[[CDIM:.*]] = gpu.cluster_dim x
// CHECK-NEXT: = gpu.cluster_dim y
// CHECK-NEXT: = gpu.cluster_dim z
// CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
// CHECK-NEXT: "some_op"(%[[CID]], %[[BID]], %[[BDIM]]) : (index, index, index) -> ()
// CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>

// -----
// This test exercises the two optional attributes kernelModule and kernelFunc on gpu.launch.
// CHECK-LABEL: func.func @testKernelAttributes()
// CHECK: gpu.launch_func @test_module::@test_kernel_func blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]])
// CHECK: gpu.module @test_module
// CHECK: gpu.func @test_kernel_func()
func.func @testKernelAttributes() {
  %gDimX = arith.constant 8 : index
  %gDimY = arith.constant 12 : index
  %gDimZ = arith.constant 16 : index
  %bDimX = arith.constant 32 : index
  %bDimY = arith.constant 16 : index
  %bDimZ = arith.constant 8 : index

  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ)
             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) {
    "some_op"(%bx, %tx) : (index, index) -> ()
    gpu.terminator
  } {kernelModule = @test_module, kernelFunc = @test_kernel_func}
  return
}

// -----
// This test exercises the optional attributes kernelModule and kernelFunc on gpu.launch when the kernel module already exists.

// CHECK-LABEL: gpu.module @existing_module
// CHECK: gpu.func @test_kernel_func()
// CHECK: gpu.func @test_kernel_func_0()
// CHECK-NOT: gpu.module @testExistingModule_kernel
// CHECK-NOT: gpu.func @testExistingModule_kernel()
// CHECK: func.func @testExistingModule()
// CHECK: gpu.launch_func @existing_module::@test_kernel_func_0 blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]])

gpu.module @existing_module {
  gpu.func @test_kernel_func() {
    gpu.return
  }
}

func.func @testExistingModule() {
  %gDimX = arith.constant 8 : index
  %gDimY = arith.constant 12 : index
  %gDimZ = arith.constant 16 : index
  %bDimX = arith.constant 32 : index
  %bDimY = arith.constant 16 : index
  %bDimZ = arith.constant 8 : index

  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ)
             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) {
    "some_op"(%bx, %tx) : (index, index) -> ()
    gpu.terminator
  } {kernelModule = @existing_module, kernelFunc = @test_kernel_func}
  return
}

// -----
// This test exercises the optional attribute kernelModule on gpu.launch.
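// When only kernelModule is specified, the outlined kernel function is named after the
// host function (with the usual _kernel suffix) and placed in the requested module.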
// CHECK-LABEL: func.func @testKernelModuleOnly()
// CHECK: gpu.launch_func @test_module::@testKernelModuleOnly_kernel blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]])
// CHECK: gpu.module @test_module
// CHECK: gpu.func @testKernelModuleOnly_kernel()
func.func @testKernelModuleOnly() {
  %gDimX = arith.constant 8 : index
  %gDimY = arith.constant 12 : index
  %gDimZ = arith.constant 16 : index
  %bDimX = arith.constant 32 : index
  %bDimY = arith.constant 16 : index
  %bDimZ = arith.constant 8 : index

  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ)
             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) {
    "some_op"(%bx, %tx) : (index, index) -> ()
    gpu.terminator
  } {kernelModule = @test_module}
  return
}

// -----
// This test exercises the optional attribute kernelFunc on gpu.launch.
// CHECK-LABEL: func.func @testKernelFuncOnly()
// CHECK: gpu.launch_func @test_kernel_func::@test_kernel_func blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]])

// CHECK: gpu.module @test_kernel_func
// CHECK: gpu.func @test_kernel_func()
func.func @testKernelFuncOnly() {
  %gDimX = arith.constant 8 : index
  %gDimY = arith.constant 12 : index
  %gDimZ = arith.constant 16 : index
  %bDimX = arith.constant 32 : index
  %bDimY = arith.constant 16 : index
  %bDimZ = arith.constant 8 : index

  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ)
             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) {
    "some_op"(%bx, %tx) : (index, index) -> ()
    gpu.terminator
  } {kernelFunc = @test_kernel_func}
  return
}

// -----
// This test exercises gpu.launch when the optional attributes kernelModule and kernelFunc are not specified.
// CHECK-LABEL: func.func @testNoAttributes()
// CHECK: gpu.launch_func @testNoAttributes_kernel::@testNoAttributes_kernel blocks in (%[[GRID_X:.*]], %[[GRID_Y:.*]], %[[GRID_Z:.*]]) threads in (%[[BLOCK_X:.*]], %[[BLOCK_Y:.*]], %[[BLOCK_Z:.*]])

// CHECK: gpu.module @testNoAttributes_kernel
// CHECK: gpu.func @testNoAttributes_kernel()
func.func @testNoAttributes() {
  %gDimX = arith.constant 8 : index
  %gDimY = arith.constant 12 : index
  %gDimZ = arith.constant 16 : index
  %bDimX = arith.constant 32 : index
  %bDimY = arith.constant 16 : index
  %bDimZ = arith.constant 8 : index

  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY, %grid_z = %gDimZ)
             threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY, %block_z = %bDimZ) {
    "some_op"(%bx, %tx) : (index, index) -> ()
    gpu.terminator
  }
  return
}