// RUN: mlir-opt --transform-interpreter --split-input-file -canonicalize -cse %s | FileCheck %s

// Tests for the transform-dialect GPU mapping ops: distributing scf.forall
// loops onto gpu.launch blocks / threads / warps / warpgroups, in both the
// multi-dimensional (x/y/z) and the linearized (linear_dim_N) flavors.

!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-LABEL: func.func @blocks_3d(
// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGT:[0-9a-z]+]]: memref<32xf32>
func.func @blocks_3d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
  %one = arith.constant 1 : index
// CHECK: gpu.launch
// CHECK: %[[BLKX:.*]] = gpu.block_id x
// CHECK: %[[BLKY:.*]] = gpu.block_id y
// CHECK: memref.load %[[ARGX]][%[[BLKX]], %[[BLKY]]]
// CHECK: memref.load %[[ARGY]][%[[BLKX]], %[[BLKY]]]
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c7, %c9) {
      %4 = memref.load %x[%i, %j] : !type
      %5 = memref.load %y[%i, %j] : !type
      %6 = math.fma %alpha, %4, %5 : f32
      memref.store %6, %y[%i, %j] : !type
    } { mapping = [#gpu.block<x>, #gpu.block<y>]}
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_forall_to_blocks %funcop grid_dims = [12, 9, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-DAG: #[[$MAP:.*]] = affine_map<()[s0] -> (s0 floordiv 128)>

// CHECK-LABEL: func.func @warpgroup_3d(
// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGT:[0-9a-z]+]]: memref<32xf32>
func.func @warpgroup_3d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %one = arith.constant 1 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
  // CHECK-DAG: %[[C384:.*]] = arith.constant 384 : index
  // CHECK-DAG: %[[C512:.*]] = arith.constant 512 : index

// CHECK: gpu.launch
// CHECK: %[[TIDX:.*]] = gpu.thread_id x
// CHECK: %[[TIDY:.*]] = gpu.thread_id y
// CHECK-DAG: %[[WG:.*]] = affine.apply #[[$MAP]]()[%[[TIDX]]]
// CHECK-DAG: %[[CMPX:.*]] = arith.cmpi ult, %[[TIDX]], %[[C384]] : index
// CHECK-DAG: %[[CMPY:.*]] = arith.cmpi ult, %[[TIDY]], %[[C1]] : index
// CHECK: %[[COND:.*]] = arith.andi %[[CMPX]], %[[CMPY]] : i1
// CHECK: scf.if %[[COND]]
// CHECK:   memref.load %[[ARGX]][%[[WG]], %[[TIDY]]]
// CHECK:   memref.load %[[ARGY]][%[[WG]], %[[TIDY]]]
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c3, %c1) {
      %4 = memref.load %x[%i, %j] : !type
      %5 = memref.load %y[%i, %j] : !type
      %6 = math.fma %alpha, %4, %5 : f32
      memref.store %6, %y[%i, %j] : !type
    } { mapping = [#gpu.warpgroup<x>, #gpu.warpgroup<y>]}
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [512, 2, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>

// Note: '$MAP' must be (re)captured here — FileCheck '$'-prefixed globals
// survive CHECK-LABEL boundaries, so without a fresh binding the use below
// would resolve to the previous test's 'floordiv 128' map.
// CHECK-DAG: #[[$MAP:.*]] = affine_map<()[s0] -> (s0 floordiv 16)>

// CHECK-LABEL: func.func @warp_3d(
// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGT:[0-9a-z]+]]: memref<32xf32>
func.func @warp_3d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %c2 = arith.constant 2 : index
  %c3 = arith.constant 3 : index
  %one = arith.constant 1 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
  // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
  // CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index
  // CHECK-DAG: %[[c64:.*]] = arith.constant 64 : index

// CHECK: gpu.launch
// CHECK: %[[TIDX:.*]] = gpu.thread_id x
// CHECK: %[[TIDY:.*]] = gpu.thread_id y
// CHECK-DAG: %[[W:.*]] = affine.apply #[[$MAP]]()[%[[TIDX]]]
// CHECK-DAG: %[[CMPX:.*]] = arith.cmpi ult, %[[TIDX]], %[[C32]] : index
// CHECK-DAG: %[[CMPY:.*]] = arith.cmpi ult, %[[TIDY]], %[[C3]] : index
// CHECK: %[[COND:.*]] = arith.andi %[[CMPX]], %[[CMPY]] : i1
// CHECK: scf.if %[[COND]]
// CHECK:   memref.load %[[ARGX]][%[[W]], %[[TIDY]]]
// CHECK:   memref.load %[[ARGY]][%[[W]], %[[TIDY]]]
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j, %k) in (%c2, %c3, %c3) {
      %4 = memref.load %x[%i, %j] : !type
      %5 = memref.load %y[%i, %j] : !type
      %6 = math.fma %alpha, %4, %5 : f32
      memref.store %6, %y[%i, %j] : !type
    } { mapping = [#gpu.warp<x>, #gpu.warp<y>, #gpu.warp<z>]}
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [64, 4, 3] warp_size = 16: (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-LABEL: func.func @threads_3d(
// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGT:[0-9a-z]+]]: memref<32xf32>
func.func @threads_3d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %one = arith.constant 1 : index
  %c12 = arith.constant 12 : index
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
// CHECK: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[C12:.*]] = arith.constant 12 : index
// CHECK: %[[C9:.*]] = arith.constant 9 : index
// CHECK: %[[C7:.*]] = arith.constant 7 : index
// CHECK: gpu.launch async [%{{.*}}] blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C1]], %{{.*}} = %[[C1]], %{{.*}} = %[[C1]]) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C12]], %{{.*}} = %[[C9]], %{{.*}} = %[[C1]])
// CHECK: %[[TIDX:.*]] = gpu.thread_id x
// CHECK: %[[TIDY:.*]] = gpu.thread_id y
// CHECK: arith.cmpi ult, %[[TIDX]], %[[C9]] : index
// CHECK: arith.cmpi ult, %[[TIDY]], %[[C7]] : index
// CHECK: memref.load %[[ARGX]][%[[TIDY]], %[[TIDX]]]
// CHECK: memref.load %[[ARGY]][%[[TIDY]], %[[TIDX]]]
// CHECK: gpu.barrier
// CHECK: arith.cmpi ult, %[[TIDY]], %[[C1]] : index
// CHECK: memref.load %[[ARGT]][%[[TIDX]]]
// CHECK: gpu.barrier
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c7, %c9) {
      %4 = memref.load %x[%i, %j] : !type
      %5 = memref.load %y[%i, %j] : !type
      %6 = math.fma %alpha, %4, %5 : f32
      memref.store %6, %y[%i, %j] : !type
    } { mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    scf.forall (%i) in (%c12) {
      %7 = memref.load %t[%i] : !type1d
      %8 = arith.addf %alpha, %7 : f32
      memref.store %8, %t[%i] : !type1d
    } {mapping = [#gpu.thread<x>] }
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [12, 9, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

!type4d = memref<32x64x4x32xf32>

// CHECK-LABEL: func.func @saxpy4d(
// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<32x64x4x32xf32>
// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<32x64x4x32xf32>
func.func @saxpy4d(%x: !type4d, %y: !type4d, %alpha : f32) -> !type4d {
  %c32 = arith.constant 32 : index
  %c64 = arith.constant 64 : index
  %c4 = arith.constant 4 : index
// CHECK: %[[C32:.*]] = arith.constant 32 : index
// CHECK: %[[C64:.*]] = arith.constant 64 : index
// CHECK: %[[C4:.*]] = arith.constant 4 : index
// CHECK: %[[C1:.*]] = arith.constant 1 : index
// CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C32]], %{{.*}} = %[[C64]], %{{.*}} = %[[C1]]) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C32]], %{{.*}} = %[[C4]], %{{.*}} = %[[C1]])
// CHECK: %[[BLKX:.*]] = gpu.block_id x
// CHECK: %[[BLKY:.*]] = gpu.block_id y
// CHECK: %[[TIDX:.*]] = gpu.thread_id x
// CHECK: %[[TIDY:.*]] = gpu.thread_id y
// CHECK: memref.load %[[ARGX]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]]
// CHECK: memref.load %[[ARGY]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]]
  scf.forall (%i, %j) in (%c32, %c64) {
    scf.forall (%k, %l) in (%c4, %c32) {
      %4 = memref.load %x[%i, %j, %k, %l] : !type4d
      %5 = memref.load %y[%i, %j, %k, %l] : !type4d
      %6 = math.fma %alpha, %4, %5 : f32
      memref.store %6, %y[%i, %j, %k, %l] : !type4d
    } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
  } { mapping = [#gpu.block<x>, #gpu.block<y>] }
  return %y : !type4d
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %gpuLaunch = transform.gpu.map_forall_to_blocks %funcop { generate_gpu_launch } : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %gpuLaunch block_dims = [32, 4, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-LABEL: func.func @saxpy2d_no_barrier(
func.func @saxpy2d_no_barrier(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %one = arith.constant 1 : index
  %c12 = arith.constant 12 : index
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
// CHECK-NOT: gpu.barrier
// CHECK: return
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c7, %c9) {
      %4 = memref.load %x[%i, %j] : !type
      %5 = memref.load %y[%i, %j] : !type
      %6 = math.fma %alpha, %4, %5 : f32
      memref.store %6, %y[%i, %j] : !type
    } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [12, 9, 1] sync_after_distribute = false : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

!type = memref<32x32xf32>
// CHECK-LABEL: func.func @saxpy2d_singleloop(
// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<32x32xf32>
// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<32x32xf32>
func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type {
  %c32 = arith.constant 32 : index
  %one = arith.constant 1 : index
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
// CHECK: %[[TIDX:.*]] = gpu.thread_id x
// CHECK: memref.load %[[ARGX]][%[[TIDX]], %[[TIDX]]]
// CHECK: memref.load %[[ARGY]][%[[TIDX]], %[[TIDX]]]
    scf.forall (%i) in (%c32) {
      %4 = memref.load %x[%i, %i] : !type
      %5 = memref.load %y[%i, %i] : !type
      %6 = arith.mulf %4, %5 : f32
      memref.store %6, %y[%i, %i] : !type
    } { mapping = [#gpu.thread<x>] }
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 1, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

!type = memref<3 x 2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-LABEL: func.func @saxpy3d_fold_id_z(
func.func @saxpy3d_fold_id_z(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %one = arith.constant 1 : index
  %c12 = arith.constant 12 : index
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
// CHECK: %[[C0:.+]] = arith.constant 0 : index
// CHECK-NOT: gpu.thread_id z
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j, %k) in (%one, %c7, %c9) {
// CHECK: memref.load %{{.*}}[%[[C0]],
// CHECK: memref.load %{{.*}}[%[[C0]],
      %4 = memref.load %x[%i, %j, %k] : !type
      %5 = memref.load %y[%i, %j, %k] : !type
      %6 = math.fma %alpha, %4, %5 : f32
// CHECK: memref.store %{{.*}}, %{{.*}}[%[[C0]]
      memref.store %6, %y[%i, %j, %k] : !type
    } { mapping = [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>] }
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [12, 9, 1] sync_after_distribute = false : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}


// -----

!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-DAG: #[[$MAPWGLIN:.*]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * 32 + s2 * 256)>
// CHECK-DAG: #[[$MAPWGX:.*]] = affine_map<()[s0, s1] -> (((s0 + s1 * 32) floordiv 128) mod 2)>
// CHECK-DAG: #[[$MAPWGY:.*]] = affine_map<()[s0, s1, s2] -> (s2 + ((s0 + s1 * 32) floordiv 128) floordiv 2)>

// CHECK-LABEL: func.func @warpgroup_linear(
// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGT:[0-9a-z]+]]: memref<32xf32>
func.func @warpgroup_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %c2 = arith.constant 2 : index
  %c3 = arith.constant 3 : index
  %one = arith.constant 1 : index

// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C768:.*]] = arith.constant 768 : index
// CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index
// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index
// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index

// CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id x
// CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id y
// CHECK-DAG: %[[TIDZ:.*]] = gpu.thread_id z
// CHECK-DAG: %[[WIDLIN:.*]] = affine.apply #[[$MAPWGLIN]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]]
// CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWGX]]()[%[[TIDX]], %[[TIDY]]]
// CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWGY]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]]
// CHECK-DAG: %[[CMPLIN:.*]] = arith.cmpi ult, %[[WIDLIN]], %[[C768]] : index
// CHECK: scf.if %[[CMPLIN]]
// CHECK:   memref.load %[[ARGX]][%[[WIDX]], %[[WIDY]]]
// CHECK:   memref.load %[[ARGY]][%[[WIDX]], %[[WIDY]]]
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c2, %c3) {
      %4 = memref.load %x[%i, %j] : !type
      %5 = memref.load %y[%i, %j] : !type
      %6 = math.fma %alpha, %4, %5 : f32
      memref.store %6, %y[%i, %j] : !type
    } { mapping = [#gpu.warpgroup<linear_dim_0>, #gpu.warpgroup<linear_dim_1>]}
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 8, 4] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-DAG: #[[$MAPWLIN:.*]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * 32 + s2 * 256)>
// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<()[s0, s1, s2] -> ((s1 + s2 * 8 + s0 floordiv 32) mod 2)>
// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<()[s0, s1, s2] -> ((s1 + s2 * 8 + s0 floordiv 32) floordiv 2)>

// CHECK-LABEL: func.func @warp_linear(
// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGT:[0-9a-z]+]]: memref<32xf32>
func.func @warp_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %c2 = arith.constant 2 : index
  %c3 = arith.constant 3 : index
  %one = arith.constant 1 : index

// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index
// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index
// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
// CHECK-DAG: %[[C192:.*]] = arith.constant 192 : index

// CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id x
// CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id y
// CHECK-DAG: %[[TIDZ:.*]] = gpu.thread_id z
// CHECK-DAG: %[[WIDLIN:.*]] = affine.apply #[[$MAPWLIN]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]]
// CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]]
// CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]]
// CHECK-DAG: %[[CMPLIN:.*]] = arith.cmpi ult, %[[WIDLIN]], %[[C192]] : index
// CHECK: scf.if %[[CMPLIN]]
// CHECK:   memref.load %[[ARGX]][%[[WIDX]], %[[WIDY]]]
// CHECK:   memref.load %[[ARGY]][%[[WIDX]], %[[WIDY]]]
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c2, %c3) {
      %4 = memref.load %x[%i, %j] : !type
      %5 = memref.load %y[%i, %j] : !type
      %6 = math.fma %alpha, %4, %5 : f32
      memref.store %6, %y[%i, %j] : !type
    } { mapping = [#gpu.warp<linear_dim_0>, #gpu.warp<linear_dim_1>]}
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 8, 4] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<()[s0, s1] -> (((s0 + s1 * 18) floordiv 32) mod 3)>
// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<()[s0, s1] -> ((((s0 + s1 * 18) floordiv 32) mod 6) floordiv 3)>

// CHECK-DAG: #[[$MAPLIN:.*]] = affine_map<()[s0, s1] -> (s0 + s1 * 18)>
// CHECK-DAG: #[[$MAPLX:.*]] = affine_map<()[s0, s1] -> ((s0 + s1 * 18) mod 10)>
// CHECK-DAG: #[[$MAPLY:.*]] = affine_map<()[s0, s1] -> ((s0 + s1 * 18) floordiv 10)>

// CHECK-LABEL: func.func @map_multi_level_linear(
func.func @map_multi_level_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %one = arith.constant 1 : index
  %c10 = arith.constant 10 : index
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c3 = arith.constant 3 : index

  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C11:.*]] = arith.constant 11 : index
  // CHECK-DAG: %[[C18:.*]] = arith.constant 18 : index
  // CHECK-DAG: %[[C20:.*]] = arith.constant 20 : index
  // CHECK-DAG: %[[C192:.*]] = arith.constant 192 : index

  // check that both the thread level and the warp level got distributed.
  // CHECK-NOT: #gpu.thread
  // CHECK-NOT: #gpu.warp
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    // CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id x
    // CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id y
    scf.forall (%i, %j) in (%c7, %c9) {
      %4 = memref.load %x[%i, %j] : !type
      %5 = memref.load %y[%i, %j] : !type
      %6 = math.fma %alpha, %4, %5 : f32
      memref.store %6, %y[%i, %j] : !type
    } { mapping = [#gpu.thread<y>, #gpu.thread<x>]}

    // CHECK-DAG: %[[LIN:.*]] = affine.apply #[[$MAPLIN]]()[%[[TIDX]], %[[TIDY]]]
    // CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]]()[%[[TIDX]], %[[TIDY]]]
    // CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]]()[%[[TIDX]], %[[TIDY]]]
    // CHECK-DAG: %[[CMPLIN:.*]] = arith.cmpi ult, %[[LIN]], %[[C192]] : index
    // CHECK: scf.if %[[CMPLIN]]
    scf.forall (%i, %j, %k) in (%c3, %c2, %c1) {
      %7 = memref.load %x[%i, %j] : !type
      %8 = arith.addf %alpha, %7 : f32
      memref.store %8, %y[%i, %j] : !type
    } {mapping = [#gpu.warp<linear_dim_0>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_2>] }

    // CHECK-DAG: %[[LIDX:.*]] = affine.apply #[[$MAPLX]]()[%[[TIDX]], %[[TIDY]]]
    // CHECK-DAG: %[[LIDY:.*]] = affine.apply #[[$MAPLY]]()[%[[TIDX]], %[[TIDY]]]
    // CHECK-DAG: %[[COND:.*]] = arith.cmpi ult, %[[LIN]], %[[C20]] : index
    // CHECK: scf.if %[[COND]]
    // CHECK:   memref.load %{{.*}}[%[[LIDX]]] : memref<32xf32>
    // CHECK:   memref.store %{{.*}}[%[[LIDY]]] : memref<32xf32>
    scf.forall (%i, %j) in (%c10, %c2) {
      %7 = memref.load %t[%i] : !type1d
      %8 = arith.addf %alpha, %7 : f32
      memref.store %8, %t[%j] : !type1d
    } {mapping = [#gpu.thread<linear_dim_0>, #gpu.thread<linear_dim_1>] }
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %funcop
      block_dims = [18, 11, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-DAG: #[[$MAPBLIN:.*]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * 12 + s2 * 108)>
// CHECK-DAG: #[[$MAPBX:.*]] = affine_map<()[s0, s1, s2] -> ((s0 + s1 * 12 + s2 * 108) mod 7)>
// CHECK-DAG: #[[$MAPBY:.*]] = affine_map<()[s0, s1, s2] -> ((s0 + s1 * 12 + s2 * 108) floordiv 7)>

// CHECK-LABEL: func.func @block_linear_existing_launch(
// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGT:[0-9a-z]+]]: memref<32xf32>
func.func @block_linear_existing_launch(
    %x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
  %one = arith.constant 1 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C9:.*]] = arith.constant 9 : index
  // CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index
  // CHECK-DAG: %[[C63:.*]] = arith.constant 63 : index
// CHECK: gpu.launch async [{{.*}}] blocks({{.*}}) in (%{{.*}} = %[[C12]], %{{.*}} = %[[C9]], %{{.*}} = %[[C1]]) threads
// CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x
// CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y
// CHECK-DAG: %[[BIDZ:.*]] = gpu.block_id z
// CHECK-DAG: %[[BIDLIN:.*]] = affine.apply #[[$MAPBLIN]]()[%[[BIDX]], %[[BIDY]], %[[BIDZ]]]
// CHECK-DAG: %[[BLX:.*]] = affine.apply #[[$MAPBX]]()[%[[BIDX]], %[[BIDY]], %[[BIDZ]]]
// CHECK-DAG: %[[BLY:.*]] = affine.apply #[[$MAPBY]]()[%[[BIDX]], %[[BIDY]], %[[BIDZ]]]
// CHECK-DAG: %[[CMPLIN:.*]] = arith.cmpi ult, %[[BIDLIN]], %[[C63]] : index
// CHECK: scf.if %[[CMPLIN]]
// CHECK:   memref.load %[[ARGX]][%[[BLX]], %[[BLY]]]
// CHECK:   memref.load %[[ARGY]][%[[BLX]], %[[BLY]]]
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c7, %c9) {
      %4 = memref.load %x[%i, %j] : !type
      %5 = memref.load %y[%i, %j] : !type
      %6 = math.fma %alpha, %4, %5 : f32
      memref.store %6, %y[%i, %j] : !type
    } { mapping = [#gpu.block<linear_dim_0>, #gpu.block<linear_dim_1>]}
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_forall_to_blocks %funcop grid_dims = [12, 9, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-DAG: #[[$MAPBX:.*]] = affine_map<()[s0] -> (s0 mod 7)>
// CHECK-DAG: #[[$MAPBY:.*]] = affine_map<()[s0, s1, s2] -> (s1 + s2 * 9 + s0 floordiv 7)>

// CHECK-LABEL: func.func @block_linear_generate_launch(
// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGT:[0-9a-z]+]]: memref<32xf32>
func.func @block_linear_generate_launch(
    %x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
  %one = arith.constant 1 : index

  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C7:.*]] = arith.constant 7 : index
  // CHECK-DAG: %[[C9:.*]] = arith.constant 9 : index
// CHECK: gpu.launch blocks({{.*}}) in (%{{.*}} = %[[C7]], %{{.*}} = %[[C9]], %{{.*}} = %[[C1]]) threads
// CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x
// CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y
// CHECK-DAG: %[[BIDZ:.*]] = gpu.block_id z
// CHECK-DAG: %[[BLX:.*]] = affine.apply #[[$MAPBX]]()[%[[BIDX]]]
// CHECK-DAG: %[[BLY:.*]] = affine.apply #[[$MAPBY]]()[%[[BIDX]], %[[BIDY]], %[[BIDZ]]]
// CHECK: memref.load %[[ARGX]][%[[BLX]], %[[BLY]]]
// CHECK: memref.load %[[ARGY]][%[[BLX]], %[[BLY]]]
  scf.forall (%i, %j) in (%c7, %c9) {
    %4 = memref.load %x[%i, %j] : !type
    %5 = memref.load %y[%i, %j] : !type
    %6 = math.fma %alpha, %4, %5 : f32
    memref.store %6, %y[%i, %j] : !type
  } { mapping = [#gpu.block<linear_dim_0>, #gpu.block<linear_dim_1>]}

  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_forall_to_blocks %funcop generate_gpu_launch : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

#map = affine_map<(d0) -> (d0 * 128)>
#map1 = affine_map<(d0) -> (d0 * 32)>

// CHECK-DAG: #[[$MAPB:.*]] = affine_map<()[s0] -> (s0 * 128)>
// CHECK-DAG: #[[$MAPW:.*]] = affine_map<()[s0, s1, s2] -> (s2 * 32 + ((s0 + s1 * 4) floordiv 32) * 32)>

// CHECK-LABEL: func.func @simple_fill(
func.func @simple_fill(%arg0: memref<128xf32>) -> memref<128xf32> {
  %c0 = arith.constant 0 : index
  %cst = arith.constant dense<0.000000e+00> : vector<32xf32>
// CHECK: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[C4:.*]] = arith.constant 4 : index
// CHECK: %[[C8:.*]] = arith.constant 8 : index
// CHECK: gpu.launch
  scf.forall (%arg1) in (1) {
// CHECK: %[[BIDX:.*]] = gpu.block_id x
// CHECK: %[[BLX:.*]] = affine.apply #[[$MAPB]]()[%[[BIDX]]]
    %0 = affine.apply #map(%arg1)
    %subview = memref.subview %arg0[%0] [128] [1] : memref<128xf32> to memref<128xf32, strided<[1], offset: ?>>
    scf.forall (%arg2) in (4) {
// CHECK: %[[TIDX:.*]] = gpu.thread_id x
// CHECK: %[[TIDY:.*]] = gpu.thread_id y
// CHECK: %[[TIDZ:.*]] = gpu.thread_id z
// CHECK: %[[THX:.*]] = affine.apply #[[$MAPW]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]]
// CHECK-NOT: scf.if
// CHECK: memref.subview %{{.*}}[%[[THX]]]
      %1 = affine.apply #map1(%arg2)
      %subview_0 = memref.subview %subview[%1] [32] [1] : memref<128xf32, strided<[1], offset: ?>> to memref<32xf32, strided<[1], offset: ?>>
      vector.transfer_write %cst, %subview_0[%c0] {in_bounds = [true]} : vector<32xf32>, memref<32xf32, strided<[1], offset: ?>>
      memref.copy %subview_0, %subview_0 : memref<32xf32, strided<[1], offset: ?>> to memref<32xf32, strided<[1], offset: ?>>
    } {mapping = [#gpu.warp<linear_dim_0>]}
    memref.copy %subview, %subview : memref<128xf32, strided<[1], offset: ?>> to memref<128xf32, strided<[1], offset: ?>>
  } {mapping = [#gpu.block<x>]}
  return %arg0 : memref<128xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
    %func = transform.structured.match ops{["func.func"]} in %module_op
      : (!transform.any_op) -> !transform.any_op
    %gpu_launch = transform.gpu.map_forall_to_blocks %func generate_gpu_launch
      : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %gpu_launch block_dims = [4, 8, 4]
      : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}