xref: /llvm-project/mlir/test/Dialect/GPU/transform-gpu-failing.mlir (revision ca5d34ec7186f2b5750c7e67dcb8b2d0dc865d8d)
1// RUN: mlir-opt --transform-interpreter --split-input-file  -canonicalize -cse --verify-diagnostics %s
2
// Negative test for transform.gpu.map_nested_forall_to_threads: the matched
// payload op is a tensor.empty, not a gpu.launch, so the transform must fail
// with the "Given target is not a gpu.launch" diagnostic.
3func.func @map_nested_forall_to_threads_not_gpu_launch() -> () {
4  %1 = tensor.empty() : tensor<4xf32>
5  return
6}
7module attributes {transform.with_named_sequence} {
8  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
9    %funcop = transform.structured.match ops{["tensor.empty"]} in %arg0 : (!transform.any_op) -> !transform.any_op
10    // expected-error @below {{Given target is not a gpu.launch}}
11    %1 = transform.gpu.map_nested_forall_to_threads %funcop block_dims = [1, 1, 1] : (!transform.any_op) -> !transform.any_op
12    transform.yield
13  }
14}
15
16// -----
17
// Negative test: requesting block_dims = [1200, 9, 1] exceeds the kernel
// launch limits, so mapping the nested foralls to threads must emit the
// "larger than the limits" error plus the "block_dims is too large" note.
18func.func @map_nested_forall_to_threads_excessive_threads(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
19  %one = arith.constant 1 : index
20  %c900 = arith.constant 900 : index
21  %c9 = arith.constant 9 : index
22  %c7 = arith.constant 7 : index
23  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
24            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
25  {
26    scf.forall (%i, %j) in (%c7, %c900) {
27        %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
28        %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
29        %6 = math.fma %alpha, %4, %5 : f32
30        memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
31     }  { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
32    gpu.terminator
33  }
34

35  %name2 = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
36            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
37  {
38    scf.forall (%i, %j) in (%c7, %c9) {
39        %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
40        %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
41        %6 = math.fma %alpha, %4, %5 : f32
42        memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
43     }  { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
44    gpu.terminator
45  }
46

47  return %y : memref<2 x 32 x f32>
48}
49

50module attributes {transform.with_named_sequence} {
51  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
52    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
53    // expected-error @below {{Trying to launch a GPU kernel with grid_dims = (1, 1, 1) block_dims = (1200, 9, 1). It is larger than the limits.}}
54    // expected-note @below {{"block_dims" is too large}}
55    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [1200, 9, 1] : (!transform.any_op) -> !transform.any_op
56    transform.yield
57  }
58}
59
60// -----
61
// Negative test: the first forall iterates 7 x 900 = 6300 times, but
// block_dims = [128, 4, 1] only provides 512 threads, so the transform must
// report the resource overflow (6300 required vs. 512 available).
62func.func @map_nested_forall_to_threads_fewer_threads(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
63  %one = arith.constant 1 : index
64  %c900 = arith.constant 900 : index
65  %c9 = arith.constant 9 : index
66  %c7 = arith.constant 7 : index
67  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
68            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
69  {
70    scf.forall (%i, %j) in (%c7, %c900) {
71        %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
72        %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
73        %6 = math.fma %alpha, %4, %5 : f32
74        memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
75     }  { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
76    gpu.terminator
77  }
78

79  %name2 = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
80            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
81  {
82    scf.forall (%i, %j) in (%c7, %c9) {
83        %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
84        %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
85        %6 = math.fma %alpha, %4, %5 : f32
86        memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
87     }  { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
88    gpu.terminator
89  }
90

91  return %y : memref<2 x 32 x f32>
92}
93

94module attributes {transform.with_named_sequence} {
95  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
96    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
97    // expected-error @below {{the number of required parallel resources (blocks or threads) 6300 overflows the number of available resources 512}}
98    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [128, 4, 1] : (!transform.any_op) -> !transform.any_op
99    transform.yield
100  }
101}
102
103// -----
104
// Negative test: %c7 is a function argument, so the forall's trip count is
// dynamic; map_nested_forall_to_threads requires statically sized, normalized
// forall ops and must reject this one.
105func.func @map_nested_forall_to_threads_dynamic_trip_count(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token, %c9 : index, %c7 : index) -> memref<2 x 32 x f32> {
106  %one = arith.constant 1 : index
107  %c900 = arith.constant 900 : index
108  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
109            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
110  {
111    scf.forall (%i, %j) in (%c7, %c900) {
112        %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
113        %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
114        %6 = math.fma %alpha, %4, %5 : f32
115        memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
116     }  { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
117    gpu.terminator
118  }
119  return %y : memref<2 x 32 x f32>
120}
121

122module attributes {transform.with_named_sequence} {
123  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
124    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
125    // expected-error @below {{requires statically sized, normalized forall op}}
126    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [128, 4, 1] : (!transform.any_op) -> !transform.any_op
127    transform.yield
128  }
129}
130
131// -----
132
// Negative test: tile_using_forall produces an scf.forall operating on
// tensors (not bufferized memrefs); map_nested_forall_to_threads only maps
// bufferized foralls and must emit a diagnostic.
133func.func @map_nested_forall_to_threads_not_buffer(%x: tensor<32x32xf32>, %y: tensor<32x32xf32>, %z: tensor<32x32xf32>, %stream : !gpu.async.token) {
134  %one = arith.constant 1 : index
135  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
136            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
137  {
138    %t = linalg.matmul ins(%x, %y: tensor<32x32xf32>, tensor<32x32xf32>) outs(%z : tensor<32x32xf32>) -> tensor<32x32xf32>
139    gpu.terminator
140  }
141  return
142}
143

144module attributes {transform.with_named_sequence} {
145  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
146    %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0 : (!transform.any_op) -> !transform.any_op
147    %tiled, %forall = transform.structured.tile_using_forall %matmul num_threads [2, 3, 1] (mapping = [ #gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z> ] )
148      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
149    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
150    // expected-error @below {{only bufferized scf.forall can be mapped}}
151    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [96, 4, 1] : (!transform.any_op) -> !transform.any_op
152    transform.yield
153  }
154}
155
156// -----
157
158
// Negative test for transform.gpu.map_forall_to_blocks: without
// generate_gpu_launch the payload target must already be a gpu.launch;
// a tensor.empty target must be rejected (with a note on the payload op).
159func.func @map_forall_to_blocks_not_gpu_launch() -> () {
160  // expected-note @below {{when applied to this payload op}}
161  %1 = tensor.empty() : tensor<4xf32>
162  return
163}
164

165module attributes {transform.with_named_sequence} {
166  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
167    %funcop = transform.structured.match ops{["tensor.empty"]} in %arg0 : (!transform.any_op) -> !transform.any_op
168    // expected-error @below {{Given target is not gpu.launch}}
169    %1 = transform.gpu.map_forall_to_blocks %funcop : (!transform.any_op) -> !transform.any_op
170    transform.yield
171  }
172}
173
174// -----
175
// Negative test: the gpu.launch region contains two sibling top-level
// scf.forall ops, so map_forall_to_blocks cannot find a unique top-level
// forall and must fail.
176func.func @map_forall_to_blocks_not_unique(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
177  %one = arith.constant 1 : index
178  %c900 = arith.constant 900 : index
179  %c9 = arith.constant 9 : index
180  %c7 = arith.constant 7 : index
181  // expected-note @below {{when applied to this payload op}}
182  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
183            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
184  {
185    scf.forall (%i, %j) in (%c7, %c900) {
186        %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
187        %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
188        %6 = math.fma %alpha, %4, %5 : f32
189        memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
190     }  { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
191

192    scf.forall (%i, %j) in (%c7, %c9) {
193        %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
194        %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
195        %6 = math.fma %alpha, %4, %5 : f32
196        memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
197     }  { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
198    gpu.terminator
199  }
200

201  return %y : memref<2 x 32 x f32>
202}
203

204module attributes {transform.with_named_sequence} {
205  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
206    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
207    // expected-error @below {{could not find a unique topLevel scf.forall}}
208    %1 = transform.gpu.map_forall_to_blocks %funcop : (!transform.any_op) -> !transform.any_op
209    transform.yield
210  }
211}
212
213// -----
214
// Negative test: with generate_gpu_launch the target func must contain a
// unique top-level scf.forall; this func has two, so the transform fails.
// NOTE(review): %c65537 actually holds 65536 (name/value mismatch) — harmless
// here because the transform fails on forall uniqueness before any bounds are
// inspected, but worth confirming against the upstream test's intent.
215// expected-note @below {{when applied to this payload op}}
216func.func @map_forall_to_blocks_large_loop(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
217  %one = arith.constant 1 : index
218  %c65537 = arith.constant 65536 : index
219  %c9 = arith.constant 9 : index
220  %c7 = arith.constant 7 : index
221

222  scf.forall (%i, %j) in (%c7, %c65537) {
223      %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
224      %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
225      %6 = math.fma %alpha, %4, %5 : f32
226      memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
227  }  { mapping = [#gpu.thread<x>, #gpu.thread<y>] }
228

229  scf.forall (%i, %j) in (%c7, %c9) {
230      %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
231      %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
232      %6 = math.fma %alpha, %4, %5 : f32
233      memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
234  }  { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
235

236  return %y : memref<2 x 32 x f32>
237}
238

239module attributes {transform.with_named_sequence} {
240  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
241    %funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
242    // expected-error @below {{could not find a unique topLevel scf.forall}}
243    %1 = transform.gpu.map_forall_to_blocks %funcop { generate_gpu_launch } : (!transform.any_op) -> !transform.any_op
244    transform.yield
245  }
246}
247
248// -----
249
// Negative test: mapping this forall to blocks derives a 65535 x 65535 x 1
// grid, which exceeds the kernel-launch limits and must be diagnosed.
250func.func @map_forall_to_blocks_large_loop(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
251  %one = arith.constant 1 : index
252  %c65535 = arith.constant 65535 : index
253  scf.forall (%i, %j) in (%c65535, %c65535) {
254      %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
255      %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
256      %6 = math.fma %alpha, %4, %5 : f32
257      memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
258  }  { mapping = [#gpu.block<x>, #gpu.block<y>] }
259  return %y : memref<2 x 32 x f32>
260}
261

262module attributes {transform.with_named_sequence} {
263  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
264    %funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
265    // expected-error @below {{Trying to launch a GPU kernel with grid_dims = (65535, 65535, 1) block_dims = (1, 1, 1). It is larger than the limits.}}
266    %1 = transform.gpu.map_forall_to_blocks %funcop generate_gpu_launch : (!transform.any_op) -> !transform.any_op
267    transform.yield
268  }
269}
270
271// -----
272
// Negative test: a single forall mixes #gpu.thread and #gpu.warp mapping
// attributes; the transform rejects mixed mapping types and tells the user
// to express this via nested foralls instead.
273!type = memref<32x32xf32>
274func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type {
275  %c32 = arith.constant 32 : index
276  %one = arith.constant 1 : index
277  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
278            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
279  {
280    scf.forall (%i, %j) in (%c32, %c32) {
281        %4 = memref.load %x[%i, %j] : !type
282        %5 = memref.load %y[%i, %j] : !type
283        %6 = arith.mulf %4, %5 : f32
284        memref.store %6, %y[%i, %j] : !type
285     }  { mapping = [#gpu.thread<x>, #gpu.warp<y>] }
286    gpu.terminator
287  }
288  return %y : !type
289}
290

291module attributes {transform.with_named_sequence} {
292  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
293    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
294    // expected-error @below {{cannot mix different mapping types, use nesting}}
295    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 32, 1] : (!transform.any_op) -> !transform.any_op
296    transform.yield
297  }
298}
299
300// -----
301
// Negative test: both forall induction variables are mapped to the same id
// (#gpu.thread<x>), which must trigger the duplicate-attribute diagnostic.
302!type = memref<32x32xf32>
303func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type {
304  %c32 = arith.constant 32 : index
305  %one = arith.constant 1 : index
306  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
307            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
308  {
309    scf.forall (%i, %j) in (%c32, %c32) {
310        %4 = memref.load %x[%i, %j] : !type
311        %5 = memref.load %y[%i, %j] : !type
312        %6 = arith.mulf %4, %5 : f32
313        memref.store %6, %y[%i, %j] : !type
314     }  { mapping = [#gpu.thread<x>, #gpu.thread<x>] }
315    gpu.terminator
316  }
317  return %y : !type
318}
319

320module attributes {transform.with_named_sequence} {
321  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
322    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
323    // expected-error @below {{duplicate attribute, cannot map different loops to the same mapping id}}
324    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 32, 1] : (!transform.any_op) -> !transform.any_op
325    transform.yield
326  }
327}
328
329// -----
330
// Negative test: the forall mixes a grid-style mapping (#gpu.thread<x>) with
// a linear one (#gpu.thread<linear_dim_0>); linear and non-linear mapping
// modes cannot be combined on one forall.
331!type = memref<32x32xf32>
332func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type {
333  %c32 = arith.constant 32 : index
334  %one = arith.constant 1 : index
335  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
336            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
337  {
338    scf.forall (%i, %j) in (%c32, %c32) {
339        %4 = memref.load %x[%i, %j] : !type
340        %5 = memref.load %y[%i, %j] : !type
341        %6 = arith.mulf %4, %5 : f32
342        memref.store %6, %y[%i, %j] : !type
343     }  { mapping = [#gpu.thread<x>, #gpu.thread<linear_dim_0>] }
344    gpu.terminator
345  }
346  return %y : !type
347}
348

349module attributes {transform.with_named_sequence} {
350  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
351    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
352    // expected-error @below {{cannot mix linear and non-linear mapping modes}}
353    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 32, 1] : (!transform.any_op) -> !transform.any_op
354    transform.yield
355  }
356}
357
358// -----
359
// Negative test: the transform module is itself the (consumed) payload and
// contains no scf.forall, so generate_gpu_launch cannot find a unique
// top-level forall to map.
360// expected-note @below {{when applied to this payload op}}
361module attributes {transform.with_named_sequence} {
362  transform.named_sequence @__transform_main(%op: !transform.any_op {transform.consumed}) {
363    // expected-error @below {{could not find a unique topLevel scf.forall}}
364    %gpu_launch = transform.gpu.map_forall_to_blocks %op generate_gpu_launch grid_dims = [1, 1, 1]
365      : (!transform.any_op) -> !transform.any_op
366    transform.yield
367  }
368}
369
370// -----
371
// Negative test: grid_dims = [50, 16] has only two entries; the transform
// requires grid_dims to be either empty or exactly size 3.
372func.func public @improperly_sized_grid_dims(%arg0: memref<32x32xf32>, %arg1: memref<32x32xf32>, %arg2: memref<32x32xf32>) {
373  scf.forall (%arg3, %arg4) in (1, 1) {
374    linalg.matmul ins(%arg0, %arg1 : memref<32x32xf32>, memref<32x32xf32>) outs(%arg2 : memref<32x32xf32>)
375  } {mapping = [#gpu.block<x>, #gpu.block<y>]}
376  return
377}
378

379module attributes {transform.with_named_sequence} {
380  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.consumed}) {
381    %arg0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
382    // expected-error @below {{transform requires empty or size-3 grid_dims}}
383    %5 = transform.gpu.map_forall_to_blocks %arg1 generate_gpu_launch grid_dims = [50, 16] : (!transform.any_op) -> !transform.any_op
384    transform.yield
385  }
386}
387
388// -----
389
// Negative test: the scf.forall carries no mapping attribute at all, which
// map_forall_to_blocks requires before it can assign loops to grid dims.
390func.func public @missing_mapping_attribute(%arg0: memref<32x32xf32>, %arg1: memref<32x32xf32>, %arg2: memref<32x32xf32>) {
391  scf.forall (%arg3, %arg4) in (1, 1) {
392    linalg.matmul ins(%arg0, %arg1 : memref<32x32xf32>, memref<32x32xf32>) outs(%arg2 : memref<32x32xf32>)
393  }
394  return
395}
396

397module attributes {transform.with_named_sequence} {
398  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.consumed}) {
399    %arg0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
400    // expected-error @below {{scf.forall op requires a mapping attribute}}
401    %5 = transform.gpu.map_forall_to_blocks %arg1 generate_gpu_launch grid_dims = [50, 16, 1] : (!transform.any_op) -> !transform.any_op
402    transform.yield
403  }
404}
405
406// -----
407
// Negative test: the forall is annotated with #gpu.thread mappings, but
// map_forall_to_blocks only accepts mapping attributes of kind 'block'.
408func.func public @not_a_block_mapping_attribute(%arg0: memref<32x32xf32>, %arg1: memref<32x32xf32>, %arg2: memref<32x32xf32>) {
409  scf.forall (%arg3, %arg4) in (1, 1) {
410    linalg.matmul ins(%arg0, %arg1 : memref<32x32xf32>, memref<32x32xf32>) outs(%arg2 : memref<32x32xf32>)
411  } {mapping = [#gpu.thread<x>, #gpu.thread<y>]}
412  return
413}
414

415module attributes {transform.with_named_sequence} {
416  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.consumed}) {
417    %arg0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
418    // expected-error @below {{scf.forall op requires a mapping attribute of kind 'block'}}
419    %5 = transform.gpu.map_forall_to_blocks %arg1 generate_gpu_launch grid_dims = [50, 16, 1] : (!transform.any_op) -> !transform.any_op
420    transform.yield
421  }
422}
423
424// -----
425
// Negative test: the forall inside the gpu.launch uses #gpu.block mappings,
// but map_nested_forall_to_threads only accepts mapping attributes of kind
// 'thread' or 'warp'.
426func.func @not_a_thread_or_warp_mapping_attribute(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
427  %one = arith.constant 1 : index
428  %c900 = arith.constant 900 : index
429  %c9 = arith.constant 9 : index
430  %c7 = arith.constant 7 : index
431  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
432            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
433  {
434    scf.forall (%i, %j) in (%c7, %c900) {
435        %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
436        %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
437        %6 = math.fma %alpha, %4, %5 : f32
438        memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
439     }  { mapping = [#gpu.block<y>, #gpu.block<x>] }
440    gpu.terminator
441  }
442

443  return %y : memref<2 x 32 x f32>
444}
445

446module attributes {transform.with_named_sequence} {
447  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
448    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
449    // expected-error @below {{scf.forall op requires a mapping attribute of kind 'thread' or 'warp'}}
450    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [1, 1, 1] : (!transform.any_op) -> !transform.any_op
451    transform.yield
452  }
453}
454