// RUN: mlir-opt --transform-interpreter --split-input-file -canonicalize -cse --verify-diagnostics %s

func.func @map_nested_forall_to_threads_not_gpu_launch() -> () {
  %1 = tensor.empty() : tensor<4xf32>
  return
}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["tensor.empty"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{Given target is not a gpu.launch}}
    %1 = transform.gpu.map_nested_forall_to_threads %funcop block_dims = [1, 1, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

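// Mapping must fail when the requested block_dims exceed the GPU kernel launch limits.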
func.func @map_nested_forall_to_threads_excessive_threads(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
  %one = arith.constant 1 : index
  %c900 = arith.constant 900 : index
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c7, %c900) {
      %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
      %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
      %6 = math.fma %alpha, %4, %5 : f32
      memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
    } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
    gpu.terminator
  }

  %name2 = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c7, %c9) {
      %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
      %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
      %6 = math.fma %alpha, %4, %5 : f32
      memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
    } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
    gpu.terminator
  }

  return %y : memref<2 x 32 x f32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{Trying to launch a GPU kernel with grid_dims = (1, 1, 1) block_dims = (1200, 9, 1). It is larger than the limits.}}
    // expected-note @below {{"block_dims" is too large}}
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [1200, 9, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

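// The 7x900 forall requires 6300 threads, more than the 128x4x1 = 512 threads available.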
func.func @map_nested_forall_to_threads_fewer_threads(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
  %one = arith.constant 1 : index
  %c900 = arith.constant 900 : index
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c7, %c900) {
      %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
      %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
      %6 = math.fma %alpha, %4, %5 : f32
      memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
    } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
    gpu.terminator
  }

  %name2 = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c7, %c9) {
      %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
      %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
      %6 = math.fma %alpha, %4, %5 : f32
      memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
    } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
    gpu.terminator
  }

  return %y : memref<2 x 32 x f32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{the number of required parallel resources (blocks or threads) 6300 overflows the number of available resources 512}}
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [128, 4, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

func.func @map_nested_forall_to_threads_dynamic_trip_count(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token, %c9 : index, %c7 : index) -> memref<2 x 32 x f32> {
  %one = arith.constant 1 : index
  %c900 = arith.constant 900 : index
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c7, %c900) {
      %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
      %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
      %6 = math.fma %alpha, %4, %5 : f32
      memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
    } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
    gpu.terminator
  }
  return %y : memref<2 x 32 x f32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{requires statically sized, normalized forall op}}
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [128, 4, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

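// Only a bufferized scf.forall can be mapped; tensor-based payloads are rejected.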
func.func @map_nested_forall_to_threads_not_buffer(%x: tensor<32x32xf32>, %y: tensor<32x32xf32>, %z: tensor<32x32xf32>, %stream : !gpu.async.token) {
  %one = arith.constant 1 : index
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    %t = linalg.matmul ins(%x, %y: tensor<32x32xf32>, tensor<32x32xf32>) outs(%z : tensor<32x32xf32>) -> tensor<32x32xf32>
    gpu.terminator
  }
  return
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %tiled, %forall = transform.structured.tile_using_forall %matmul num_threads [2, 3, 1] (mapping = [ #gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z> ] )
      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{only bufferized scf.forall can be mapped}}
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [96, 4, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

func.func @map_forall_to_blocks_not_gpu_launch() -> () {
  // expected-note @below {{when applied to this payload op}}
  %1 = tensor.empty() : tensor<4xf32>
  return
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["tensor.empty"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{Given target is not gpu.launch}}
    %1 = transform.gpu.map_forall_to_blocks %funcop : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

func.func @map_forall_to_blocks_not_unique(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
  %one = arith.constant 1 : index
  %c900 = arith.constant 900 : index
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
  // expected-note @below {{when applied to this payload op}}
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c7, %c900) {
      %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
      %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
      %6 = math.fma %alpha, %4, %5 : f32
      memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
    } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }

    scf.forall (%i, %j) in (%c7, %c9) {
      %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
      %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
      %6 = math.fma %alpha, %4, %5 : f32
      memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
    } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
    gpu.terminator
  }

  return %y : memref<2 x 32 x f32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{could not find a unique topLevel scf.forall}}
    %1 = transform.gpu.map_forall_to_blocks %funcop : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

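// generate_gpu_launch requires a unique top-level scf.forall; two siblings are ambiguous.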
// expected-note @below {{when applied to this payload op}}
func.func @map_forall_to_blocks_large_loop(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
  %one = arith.constant 1 : index
  %c65536 = arith.constant 65536 : index
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index

  scf.forall (%i, %j) in (%c7, %c65536) {
    %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
    %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
    %6 = math.fma %alpha, %4, %5 : f32
    memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
  } { mapping = [#gpu.thread<x>, #gpu.thread<y>] }

  scf.forall (%i, %j) in (%c7, %c9) {
    %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
    %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
    %6 = math.fma %alpha, %4, %5 : f32
    memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
  } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }

  return %y : memref<2 x 32 x f32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{could not find a unique topLevel scf.forall}}
    %1 = transform.gpu.map_forall_to_blocks %funcop { generate_gpu_launch } : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

func.func @map_forall_to_blocks_large_loop(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
  %one = arith.constant 1 : index
  %c65535 = arith.constant 65535 : index
  scf.forall (%i, %j) in (%c65535, %c65535) {
    %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
    %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
    %6 = math.fma %alpha, %4, %5 : f32
    memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
  } { mapping = [#gpu.block<x>, #gpu.block<y>] }
  return %y : memref<2 x 32 x f32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{Trying to launch a GPU kernel with grid_dims = (65535, 65535, 1) block_dims = (1, 1, 1). It is larger than the limits.}}
    %1 = transform.gpu.map_forall_to_blocks %funcop generate_gpu_launch : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

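// Thread and warp mappings cannot be mixed within one scf.forall.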
!type = memref<32x32xf32>
func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type {
  %c32 = arith.constant 32 : index
  %one = arith.constant 1 : index
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c32, %c32) {
      %4 = memref.load %x[%i, %j] : !type
      %5 = memref.load %y[%i, %j] : !type
      %6 = arith.mulf %4, %5 : f32
      memref.store %6, %y[%i, %j] : !type
    } { mapping = [#gpu.thread<x>, #gpu.warp<y>] }
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{cannot mix different mapping types, use nesting}}
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 32, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

!type = memref<32x32xf32>
func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type {
  %c32 = arith.constant 32 : index
  %one = arith.constant 1 : index
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c32, %c32) {
      %4 = memref.load %x[%i, %j] : !type
      %5 = memref.load %y[%i, %j] : !type
      %6 = arith.mulf %4, %5 : f32
      memref.store %6, %y[%i, %j] : !type
    } { mapping = [#gpu.thread<x>, #gpu.thread<x>] }
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{duplicate attribute, cannot map different loops to the same mapping id}}
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 32, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

!type = memref<32x32xf32>
func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type {
  %c32 = arith.constant 32 : index
  %one = arith.constant 1 : index
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c32, %c32) {
      %4 = memref.load %x[%i, %j] : !type
      %5 = memref.load %y[%i, %j] : !type
      %6 = arith.mulf %4, %5 : f32
      memref.store %6, %y[%i, %j] : !type
    } { mapping = [#gpu.thread<x>, #gpu.thread<linear_dim_0>] }
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{cannot mix linear and non-linear mapping modes}}
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 32, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

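// A payload containing no scf.forall at all also fails the unique top-level forall lookup.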
// expected-note @below {{when applied to this payload op}}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%op: !transform.any_op {transform.consumed}) {
    // expected-error @below {{could not find a unique topLevel scf.forall}}
    %gpu_launch = transform.gpu.map_forall_to_blocks %op generate_gpu_launch grid_dims = [1, 1, 1]
      : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

func.func public @improperly_sized_grid_dims(%arg0: memref<32x32xf32>, %arg1: memref<32x32xf32>, %arg2: memref<32x32xf32>) {
  scf.forall (%arg3, %arg4) in (1, 1) {
    linalg.matmul ins(%arg0, %arg1 : memref<32x32xf32>, memref<32x32xf32>) outs(%arg2 : memref<32x32xf32>)
  } {mapping = [#gpu.block<x>, #gpu.block<y>]}
  return
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.consumed}) {
    %arg0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{transform requires empty or size-3 grid_dims}}
    %5 = transform.gpu.map_forall_to_blocks %arg1 generate_gpu_launch grid_dims = [50, 16] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

func.func public @missing_mapping_attribute(%arg0: memref<32x32xf32>, %arg1: memref<32x32xf32>, %arg2: memref<32x32xf32>) {
  scf.forall (%arg3, %arg4) in (1, 1) {
    linalg.matmul ins(%arg0, %arg1 : memref<32x32xf32>, memref<32x32xf32>) outs(%arg2 : memref<32x32xf32>)
  }
  return
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.consumed}) {
    %arg0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{scf.forall op requires a mapping attribute}}
    %5 = transform.gpu.map_forall_to_blocks %arg1 generate_gpu_launch grid_dims = [50, 16, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

func.func public @not_a_block_mapping_attribute(%arg0: memref<32x32xf32>, %arg1: memref<32x32xf32>, %arg2: memref<32x32xf32>) {
  scf.forall (%arg3, %arg4) in (1, 1) {
    linalg.matmul ins(%arg0, %arg1 : memref<32x32xf32>, memref<32x32xf32>) outs(%arg2 : memref<32x32xf32>)
  } {mapping = [#gpu.thread<x>, #gpu.thread<y>]}
  return
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.consumed}) {
    %arg0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{scf.forall op requires a mapping attribute of kind 'block'}}
    %5 = transform.gpu.map_forall_to_blocks %arg1 generate_gpu_launch grid_dims = [50, 16, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

func.func @not_a_thread_or_warp_mapping_attribute(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
  %one = arith.constant 1 : index
  %c900 = arith.constant 900 : index
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c7, %c900) {
      %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
      %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
      %6 = math.fma %alpha, %4, %5 : f32
      memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
    } { mapping = [#gpu.block<y>, #gpu.block<x>] }
    gpu.terminator
  }

  return %y : memref<2 x 32 x f32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{scf.forall op requires a mapping attribute of kind 'thread' or 'warp'}}
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [1, 1, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}