// RUN: mlir-opt --transform-interpreter --split-input-file -canonicalize -cse %s | FileCheck %s

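// Check that a forall mapped to #gpu.block dimensions is distributed onto the
// grid of an existing gpu.launch, rewriting the loop indices to gpu.block_id.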
!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-LABEL: func.func @blocks_3d(
// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGT:[0-9a-z]+]]: memref<32xf32>
func.func @blocks_3d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
  %one = arith.constant 1 : index
//      CHECK:   gpu.launch
//      CHECK:   %[[BLKX:.*]] = gpu.block_id  x
//      CHECK:   %[[BLKY:.*]] = gpu.block_id  y
//      CHECK:   memref.load %[[ARGX]][%[[BLKX]], %[[BLKY]]]
//      CHECK:   memref.load %[[ARGY]][%[[BLKX]], %[[BLKY]]]
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c7, %c9) {
        %4 = memref.load %x[%i, %j] : !type
        %5 = memref.load %y[%i, %j] : !type
        %6 = math.fma %alpha, %4, %5 : f32
        memref.store %6, %y[%i, %j] : !type
     }  { mapping = [#gpu.block<x>, #gpu.block<y>]}
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_forall_to_blocks %funcop grid_dims = [12, 9, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

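// Check the #gpu.warpgroup mapping: with block_dims = [512, 2, 1], the
// warpgroup id along x is thread_id_x floordiv 128, and the body is guarded
// so that only ids below 3 * 128 = 384 execute.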
!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-DAG: #[[$MAP:.*]] = affine_map<()[s0] -> (s0 floordiv 128)>

// CHECK-LABEL: func.func @warpgroup_3d(
// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGT:[0-9a-z]+]]: memref<32xf32>
func.func @warpgroup_3d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %one = arith.constant 1 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
  // CHECK-DAG: %[[C384:.*]] = arith.constant 384 : index
  // CHECK-DAG: %[[C512:.*]] = arith.constant 512 : index

//      CHECK:   gpu.launch
//      CHECK:   %[[TIDX:.*]] = gpu.thread_id  x
//      CHECK:   %[[TIDY:.*]] = gpu.thread_id  y
//  CHECK-DAG:   %[[WG:.*]] = affine.apply #[[$MAP]]()[%[[TIDX]]]
//  CHECK-DAG:   %[[CMPX:.*]] = arith.cmpi ult, %[[TIDX]], %[[C384]] : index
//  CHECK-DAG:   %[[CMPY:.*]] = arith.cmpi ult, %[[TIDY]], %[[C1]] : index
//      CHECK:   %[[COND:.*]] = arith.andi %[[CMPX]], %[[CMPY]] : i1
//      CHECK:   scf.if %[[COND]]
//      CHECK:     memref.load %[[ARGX]][%[[WG]], %[[TIDY]]]
//      CHECK:     memref.load %[[ARGY]][%[[WG]], %[[TIDY]]]
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c3, %c1) {
        %4 = memref.load %x[%i, %j] : !type
        %5 = memref.load %y[%i, %j] : !type
        %6 = math.fma %alpha, %4, %5 : f32
        memref.store %6, %y[%i, %j] : !type
     }  { mapping = [#gpu.warpgroup<x>, #gpu.warpgroup<y>]}
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [512, 2, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

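// Check the #gpu.warp mapping with an explicit warp_size = 16: the warp id
// along x is thread_id_x floordiv 16, guarded below 2 * 16 = 32.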
!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-DAG: #[[$MAP:.*]] = affine_map<()[s0] -> (s0 floordiv 16)>

// CHECK-LABEL: func.func @warp_3d(
// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGT:[0-9a-z]+]]: memref<32xf32>
func.func @warp_3d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %c2 = arith.constant 2 : index
  %c3 = arith.constant 3 : index
  %one = arith.constant 1 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
  // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
  // CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index
  // CHECK-DAG: %[[C64:.*]] = arith.constant 64 : index

//      CHECK:   gpu.launch
//      CHECK:   %[[TIDX:.*]] = gpu.thread_id  x
//      CHECK:   %[[TIDY:.*]] = gpu.thread_id  y
//  CHECK-DAG:   %[[W:.*]] = affine.apply #[[$MAP]]()[%[[TIDX]]]
//  CHECK-DAG:   %[[CMPX:.*]] = arith.cmpi ult, %[[TIDX]], %[[C32]] : index
//  CHECK-DAG:   %[[CMPY:.*]] = arith.cmpi ult, %[[TIDY]], %[[C3]] : index
//      CHECK:   %[[COND:.*]] = arith.andi %[[CMPX]], %[[CMPY]] : i1
//      CHECK:   scf.if %[[COND]]
//      CHECK:     memref.load %[[ARGX]][%[[W]], %[[TIDY]]]
//      CHECK:     memref.load %[[ARGY]][%[[W]], %[[TIDY]]]
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j, %k) in (%c2, %c3, %c3) {
        %4 = memref.load %x[%i, %j] : !type
        %5 = memref.load %y[%i, %j] : !type
        %6 = math.fma %alpha, %4, %5 : f32
        memref.store %6, %y[%i, %j] : !type
     }  { mapping = [#gpu.warp<x>, #gpu.warp<y>, #gpu.warp<z>]}
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [64, 4, 3] warp_size = 16 : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

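// Check the #gpu.thread mapping of two sibling foralls: each gets a guard
// against its iteration bounds and a trailing gpu.barrier.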
!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-LABEL: func.func @threads_3d(
// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGT:[0-9a-z]+]]: memref<32xf32>
func.func @threads_3d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %one = arith.constant 1 : index
  %c12 = arith.constant 12 : index
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
//      CHECK:   %[[C1:.*]] = arith.constant 1 : index
//      CHECK:   %[[C12:.*]] = arith.constant 12 : index
//      CHECK:   %[[C9:.*]] = arith.constant 9 : index
//      CHECK:   %[[C7:.*]] = arith.constant 7 : index
//      CHECK:   gpu.launch async [%{{.*}}] blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C1]], %{{.*}} = %[[C1]], %{{.*}} = %[[C1]]) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C12]], %{{.*}} = %[[C9]], %{{.*}} = %[[C1]])
//      CHECK:   %[[TIDX:.*]] = gpu.thread_id  x
//      CHECK:   %[[TIDY:.*]] = gpu.thread_id  y
//      CHECK:   arith.cmpi ult, %[[TIDX]], %[[C9]] : index
//      CHECK:   arith.cmpi ult, %[[TIDY]], %[[C7]] : index
//      CHECK:   memref.load %[[ARGX]][%[[TIDY]], %[[TIDX]]]
//      CHECK:   memref.load %[[ARGY]][%[[TIDY]], %[[TIDX]]]
//      CHECK:   gpu.barrier
//      CHECK:   arith.cmpi ult, %[[TIDY]], %[[C1]] : index
//      CHECK:   memref.load %[[ARGT]][%[[TIDX]]]
//      CHECK:   gpu.barrier
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c7, %c9) {
        %4 = memref.load %x[%i, %j] : !type
        %5 = memref.load %y[%i, %j] : !type
        %6 = math.fma %alpha, %4, %5 : f32
        memref.store %6, %y[%i, %j] : !type
     }  { mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    scf.forall (%i) in (%c12) {
        %7 = memref.load %t[%i] : !type1d
        %8 = arith.addf %alpha, %7 : f32
        memref.store %8, %t[%i] : !type1d
     }  {mapping = [#gpu.thread<x>] }
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [12, 9, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

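// Check generate_gpu_launch: nested foralls become a fresh gpu.launch, with
// the outer loop mapped to blocks and the inner one to threads.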
!type4d = memref<32x64x4x32xf32>

// CHECK-LABEL: func.func @saxpy4d(
// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<32x64x4x32xf32>
// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<32x64x4x32xf32>
func.func @saxpy4d(%x: !type4d, %y: !type4d, %alpha : f32) -> !type4d {
  %c32 = arith.constant 32 : index
  %c64 = arith.constant 64 : index
  %c4 = arith.constant 4 : index
//      CHECK:   %[[C32:.*]] = arith.constant 32 : index
//      CHECK:   %[[C64:.*]] = arith.constant 64 : index
//      CHECK:   %[[C4:.*]] = arith.constant 4 : index
//      CHECK:   %[[C1:.*]] = arith.constant 1 : index
//      CHECK:   gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C32]], %{{.*}} = %[[C64]], %{{.*}} = %[[C1]]) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C32]], %{{.*}} = %[[C4]], %{{.*}} = %[[C1]])
//      CHECK:   %[[BLKX:.*]] = gpu.block_id  x
//      CHECK:   %[[BLKY:.*]] = gpu.block_id  y
//      CHECK:   %[[TIDX:.*]] = gpu.thread_id  x
//      CHECK:   %[[TIDY:.*]] = gpu.thread_id  y
//      CHECK:   memref.load %[[ARGX]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]]
//      CHECK:   memref.load %[[ARGY]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]]
  scf.forall (%i, %j) in (%c32, %c64) {
    scf.forall (%k, %l) in (%c4, %c32) {
      %4 = memref.load %x[%i, %j, %k, %l] : !type4d
      %5 = memref.load %y[%i, %j, %k, %l] : !type4d
      %6 = math.fma %alpha, %4, %5 : f32
      memref.store %6, %y[%i, %j, %k, %l] : !type4d
    }  { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
  }  { mapping = [#gpu.block<x>, #gpu.block<y>] }
  return %y : !type4d
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %gpuLaunch = transform.gpu.map_forall_to_blocks %funcop { generate_gpu_launch } : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %gpuLaunch block_dims = [32, 4, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

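// Check that sync_after_distribute = false omits the gpu.barrier normally
// inserted after a distributed forall.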
!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-LABEL: func.func @saxpy2d_no_barrier(
func.func @saxpy2d_no_barrier(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %one = arith.constant 1 : index
  %c12 = arith.constant 12 : index
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
//  CHECK-NOT:   gpu.barrier
//      CHECK:   return
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c7, %c9) {
        %4 = memref.load %x[%i, %j] : !type
        %5 = memref.load %y[%i, %j] : !type
        %6 = math.fma %alpha, %4, %5 : f32
        memref.store %6, %y[%i, %j] : !type
     }  { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [12, 9, 1] sync_after_distribute = false : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

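// Check the distribution of a 1-D forall onto threads along x only.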
!type = memref<32x32xf32>
// CHECK-LABEL: func.func @saxpy2d_singleloop(
// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<32x32xf32>
// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<32x32xf32>
func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type {
  %c32 = arith.constant 32 : index
  %one = arith.constant 1 : index
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
//      CHECK:   %[[TIDX:.*]] = gpu.thread_id  x
//      CHECK:   memref.load %[[ARGX]][%[[TIDX]], %[[TIDX]]]
//      CHECK:   memref.load %[[ARGY]][%[[TIDX]], %[[TIDX]]]
    scf.forall (%i) in (%c32) {
        %4 = memref.load %x[%i, %i] : !type
        %5 = memref.load %y[%i, %i] : !type
        %6 = arith.mulf %4, %5 : f32
        memref.store %6, %y[%i, %i] : !type
     }  { mapping = [#gpu.thread<x>] }
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 1, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

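// Check that a mapped dimension of extent 1 folds away: no gpu.thread_id z is
// materialized and the corresponding index becomes the constant 0.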
!type = memref<3 x 2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-LABEL: func.func @saxpy3d_fold_id_z(
func.func @saxpy3d_fold_id_z(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %one = arith.constant 1 : index
  %c12 = arith.constant 12 : index
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
//  CHECK: %[[C0:.+]] = arith.constant 0 : index
//  CHECK-NOT:   gpu.thread_id  z
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j, %k) in (%one, %c7, %c9) {
//      CHECK:   memref.load %{{.*}}[%[[C0]],
//      CHECK:   memref.load %{{.*}}[%[[C0]],
        %4 = memref.load %x[%i, %j, %k] : !type
        %5 = memref.load %y[%i, %j, %k] : !type
        %6 = math.fma %alpha, %4, %5 : f32
//      CHECK:   memref.store %{{.*}}, %{{.*}}[%[[C0]]
        memref.store %6, %y[%i, %j, %k] : !type
     }  { mapping = [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>] }
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [12, 9, 1] sync_after_distribute = false : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

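// Check the linear warpgroup mapping: warpgroup ids along x and y are
// delinearized from the thread ids, with a single guard on the linear id.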
!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-DAG: #[[$MAPWGLIN:.*]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * 32 + s2 * 256)>
// CHECK-DAG: #[[$MAPWGX:.*]] = affine_map<()[s0, s1] -> (((s0 + s1 * 32) floordiv 128) mod 2)>
// CHECK-DAG: #[[$MAPWGY:.*]] = affine_map<()[s0, s1, s2] -> (s2 + ((s0 + s1 * 32) floordiv 128) floordiv 2)>

// CHECK-LABEL: func.func @warpgroup_linear(
// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGT:[0-9a-z]+]]: memref<32xf32>
func.func @warpgroup_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %c2 = arith.constant 2 : index
  %c3 = arith.constant 3 : index
  %one = arith.constant 1 : index

// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C768:.*]] = arith.constant 768 : index
// CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index
// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index
// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index

// CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id  x
// CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id  y
// CHECK-DAG: %[[TIDZ:.*]] = gpu.thread_id  z
// CHECK-DAG: %[[WIDLIN:.*]] = affine.apply #[[$MAPWGLIN]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]]
// CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWGX]]()[%[[TIDX]], %[[TIDY]]]
// CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWGY]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]]
// CHECK-DAG: %[[CMPLIN:.*]] = arith.cmpi ult, %[[WIDLIN]], %[[C768]] : index
//     CHECK: scf.if %[[CMPLIN]]
//      CHECK:   memref.load %[[ARGX]][%[[WIDX]], %[[WIDY]]]
//      CHECK:   memref.load %[[ARGY]][%[[WIDX]], %[[WIDY]]]
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c2, %c3) {
        %4 = memref.load %x[%i, %j] : !type
        %5 = memref.load %y[%i, %j] : !type
        %6 = math.fma %alpha, %4, %5 : f32
        memref.store %6, %y[%i, %j] : !type
     }  { mapping = [#gpu.warpgroup<linear_dim_0>, #gpu.warpgroup<linear_dim_1>]}
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 8, 4] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

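// Check the linear warp mapping: warp ids are delinearized from the thread
// ids, with a single guard on the linear id.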
!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-DAG: #[[$MAPWLIN:.*]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * 32 + s2 * 256)>
// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<()[s0, s1, s2] -> ((s1 + s2 * 8 + s0 floordiv 32) mod 2)>
// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<()[s0, s1, s2] -> ((s1 + s2 * 8 + s0 floordiv 32) floordiv 2)>

// CHECK-LABEL: func.func @warp_linear(
// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGT:[0-9a-z]+]]: memref<32xf32>
func.func @warp_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %c2 = arith.constant 2 : index
  %c3 = arith.constant 3 : index
  %one = arith.constant 1 : index

// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index
// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index
// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
// CHECK-DAG: %[[C192:.*]] = arith.constant 192 : index

// CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id  x
// CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id  y
// CHECK-DAG: %[[TIDZ:.*]] = gpu.thread_id  z
// CHECK-DAG: %[[WIDLIN:.*]] = affine.apply #[[$MAPWLIN]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]]
// CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]]
// CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]]
// CHECK-DAG: %[[CMPLIN:.*]] = arith.cmpi ult, %[[WIDLIN]], %[[C192]] : index
//     CHECK: scf.if %[[CMPLIN]]
//      CHECK:   memref.load %[[ARGX]][%[[WIDX]], %[[WIDY]]]
//      CHECK:   memref.load %[[ARGY]][%[[WIDX]], %[[WIDY]]]
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c2, %c3) {
        %4 = memref.load %x[%i, %j] : !type
        %5 = memref.load %y[%i, %j] : !type
        %6 = math.fma %alpha, %4, %5 : f32
        memref.store %6, %y[%i, %j] : !type
     }  { mapping = [#gpu.warp<linear_dim_0>, #gpu.warp<linear_dim_1>]}
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 8, 4] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

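// Check that thread, linear warp, and linear thread mappings can coexist
// within one gpu.launch region, each forall getting its own ids and guards.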
!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<()[s0, s1] -> (((s0 + s1 * 18) floordiv 32) mod 3)>
// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<()[s0, s1] -> ((((s0 + s1 * 18) floordiv 32) mod 6) floordiv 3)>

// CHECK-DAG: #[[$MAPLIN:.*]] = affine_map<()[s0, s1] -> (s0 + s1 * 18)>
// CHECK-DAG: #[[$MAPLX:.*]] = affine_map<()[s0, s1] -> ((s0 + s1 * 18) mod 10)>
// CHECK-DAG: #[[$MAPLY:.*]] = affine_map<()[s0, s1] -> ((s0 + s1 * 18) floordiv 10)>

// CHECK-LABEL: func.func @map_multi_level_linear(
func.func @map_multi_level_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %one = arith.constant 1 : index
  %c10 = arith.constant 10 : index
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c3 = arith.constant 3 : index

  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C11:.*]] = arith.constant 11 : index
  // CHECK-DAG: %[[C18:.*]] = arith.constant 18 : index
  // CHECK-DAG: %[[C20:.*]] = arith.constant 20 : index
  // CHECK-DAG: %[[C192:.*]] = arith.constant 192 : index

  // Check that both the thread level and the warp level were distributed.
  //  CHECK-NOT: #gpu.thread
  //  CHECK-NOT: #gpu.warp
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    // CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id  x
    // CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id  y
    scf.forall (%i, %j) in (%c7, %c9) {
      %4 = memref.load %x[%i, %j] : !type
      %5 = memref.load %y[%i, %j] : !type
      %6 = math.fma %alpha, %4, %5 : f32
      memref.store %6, %y[%i, %j] : !type
    }  { mapping = [#gpu.thread<y>, #gpu.thread<x>]}

    // CHECK-DAG: %[[LIN:.*]] = affine.apply #[[$MAPLIN]]()[%[[TIDX]], %[[TIDY]]]
    // CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]]()[%[[TIDX]], %[[TIDY]]]
    // CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]]()[%[[TIDX]], %[[TIDY]]]
    // CHECK-DAG: %[[CMPLIN:.*]] = arith.cmpi ult, %[[LIN]], %[[C192]] : index
    //     CHECK: scf.if %[[CMPLIN]]
    scf.forall (%i, %j, %k) in (%c3, %c2, %c1) {
        %7 = memref.load %x[%i, %j] : !type
        %8 = arith.addf %alpha, %7 : f32
        memref.store %8, %y[%i, %j] : !type
     }  {mapping = [#gpu.warp<linear_dim_0>, #gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_2>] }

    // CHECK-DAG: %[[LIDX:.*]] = affine.apply #[[$MAPLX]]()[%[[TIDX]], %[[TIDY]]]
    // CHECK-DAG: %[[LIDY:.*]] = affine.apply #[[$MAPLY]]()[%[[TIDX]], %[[TIDY]]]
    // CHECK-DAG: %[[COND:.*]] = arith.cmpi ult, %[[LIN]], %[[C20]] : index
    //     CHECK: scf.if %[[COND]]
    //     CHECK:   memref.load %{{.*}}[%[[LIDX]]] : memref<32xf32>
    //     CHECK:   memref.store %{{.*}}[%[[LIDY]]] : memref<32xf32>
    scf.forall (%i, %j) in (%c10, %c2) {
        %7 = memref.load %t[%i] : !type1d
        %8 = arith.addf %alpha, %7 : f32
        memref.store %8, %t[%j] : !type1d
     }  {mapping = [#gpu.thread<linear_dim_0>, #gpu.thread<linear_dim_1>] }
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %funcop
      block_dims = [18, 11, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

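// Check the linear block mapping on an existing gpu.launch: block ids are
// delinearized and guarded against the 7 * 9 = 63 iteration bound.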
!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-DAG: #[[$MAPBLIN:.*]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * 12 + s2 * 108)>
// CHECK-DAG: #[[$MAPBX:.*]] = affine_map<()[s0, s1, s2] -> ((s0 + s1 * 12 + s2 * 108) mod 7)>
// CHECK-DAG: #[[$MAPBY:.*]] = affine_map<()[s0, s1, s2] -> ((s0 + s1 * 12 + s2 * 108) floordiv 7)>

// CHECK-LABEL: func.func @block_linear_existing_launch(
// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGT:[0-9a-z]+]]: memref<32xf32>
func.func @block_linear_existing_launch(
    %x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
  %one = arith.constant 1 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C9:.*]] = arith.constant 9 : index
  // CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index
  // CHECK-DAG: %[[C63:.*]] = arith.constant 63 : index
//      CHECK:   gpu.launch async [{{.*}}] blocks({{.*}}) in (%{{.*}} = %[[C12]], %{{.*}} = %[[C9]], %{{.*}} = %[[C1]]) threads
//  CHECK-DAG: %[[BIDX:.*]] = gpu.block_id  x
//  CHECK-DAG: %[[BIDY:.*]] = gpu.block_id  y
//  CHECK-DAG: %[[BIDZ:.*]] = gpu.block_id  z
//  CHECK-DAG: %[[BIDLIN:.*]] = affine.apply #[[$MAPBLIN]]()[%[[BIDX]], %[[BIDY]], %[[BIDZ]]]
//  CHECK-DAG: %[[BLX:.*]] = affine.apply #[[$MAPBX]]()[%[[BIDX]], %[[BIDY]], %[[BIDZ]]]
//  CHECK-DAG: %[[BLY:.*]] = affine.apply #[[$MAPBY]]()[%[[BIDX]], %[[BIDY]], %[[BIDZ]]]
//  CHECK-DAG: %[[CMPLIN:.*]] = arith.cmpi ult, %[[BIDLIN]], %[[C63]] : index
//     CHECK: scf.if %[[CMPLIN]]
//      CHECK:   memref.load %[[ARGX]][%[[BLX]], %[[BLY]]]
//      CHECK:   memref.load %[[ARGY]][%[[BLX]], %[[BLY]]]
  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
  {
    scf.forall (%i, %j) in (%c7, %c9) {
        %4 = memref.load %x[%i, %j] : !type
        %5 = memref.load %y[%i, %j] : !type
        %6 = math.fma %alpha, %4, %5 : f32
        memref.store %6, %y[%i, %j] : !type
     }  { mapping = [#gpu.block<linear_dim_0>, #gpu.block<linear_dim_1>]}
    gpu.terminator
  }
  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_forall_to_blocks %funcop grid_dims = [12, 9, 1] : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

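// Check the linear block mapping with generate_gpu_launch: the grid is sized
// to the iteration space (7 x 9), so no guard is expected.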
!type = memref<2 x 32 x f32>
!type1d = memref<32 x f32>

// CHECK-DAG: #[[$MAPBX:.*]] = affine_map<()[s0] -> (s0 mod 7)>
// CHECK-DAG: #[[$MAPBY:.*]] = affine_map<()[s0, s1, s2] -> (s1 + s2 * 9 + s0 floordiv 7)>

// CHECK-LABEL: func.func @block_linear_generate_launch(
// CHECK-SAME:    %[[ARGX:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGY:[0-9a-z]+]]: memref<2x32xf32>
// CHECK-SAME:    %[[ARGT:[0-9a-z]+]]: memref<32xf32>
func.func @block_linear_generate_launch(
    %x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
  %c9 = arith.constant 9 : index
  %c7 = arith.constant 7 : index
  %one = arith.constant 1 : index

  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C7:.*]] = arith.constant 7 : index
  // CHECK-DAG: %[[C9:.*]] = arith.constant 9 : index
//      CHECK:   gpu.launch blocks({{.*}}) in (%{{.*}} = %[[C7]], %{{.*}} = %[[C9]], %{{.*}} = %[[C1]]) threads
//  CHECK-DAG: %[[BIDX:.*]] = gpu.block_id  x
//  CHECK-DAG: %[[BIDY:.*]] = gpu.block_id  y
//  CHECK-DAG: %[[BIDZ:.*]] = gpu.block_id  z
//  CHECK-DAG: %[[BLX:.*]] = affine.apply #[[$MAPBX]]()[%[[BIDX]]]
//  CHECK-DAG: %[[BLY:.*]] = affine.apply #[[$MAPBY]]()[%[[BIDX]], %[[BIDY]], %[[BIDZ]]]
//      CHECK:   memref.load %[[ARGX]][%[[BLX]], %[[BLY]]]
//      CHECK:   memref.load %[[ARGY]][%[[BLX]], %[[BLY]]]
  scf.forall (%i, %j) in (%c7, %c9) {
    %4 = memref.load %x[%i, %j] : !type
    %5 = memref.load %y[%i, %j] : !type
    %6 = math.fma %alpha, %4, %5 : f32
    memref.store %6, %y[%i, %j] : !type
  }  { mapping = [#gpu.block<linear_dim_0>, #gpu.block<linear_dim_1>]}

  return %y : !type
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_forall_to_blocks %funcop generate_gpu_launch : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

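// Check blocks and linear warps combined: the outer forall maps to blocks and
// the inner one to warps, with no guard since all 4 warps are used.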
#map = affine_map<(d0) -> (d0 * 128)>
#map1 = affine_map<(d0) -> (d0 * 32)>

// CHECK-DAG: #[[$MAPB:.*]] = affine_map<()[s0] -> (s0 * 128)>
// CHECK-DAG: #[[$MAPW:.*]] = affine_map<()[s0, s1, s2] -> (s2 * 32 + ((s0 + s1 * 4) floordiv 32) * 32)>

// CHECK-LABEL: func.func @simple_fill(
func.func @simple_fill(%arg0: memref<128xf32>) -> memref<128xf32> {
  %c0 = arith.constant 0 : index
  %cst = arith.constant dense<0.000000e+00> : vector<32xf32>
//       CHECK:   %[[C1:.*]] = arith.constant 1 : index
//       CHECK:   %[[C4:.*]] = arith.constant 4 : index
//       CHECK:   %[[C8:.*]] = arith.constant 8 : index
//       CHECK:   gpu.launch
  scf.forall (%arg1) in (1) {
//       CHECK:     %[[BIDX:.*]] = gpu.block_id  x
//       CHECK:     %[[BLX:.*]] = affine.apply #[[$MAPB]]()[%[[BIDX]]]
    %0 = affine.apply #map(%arg1)
    %subview = memref.subview %arg0[%0] [128] [1] : memref<128xf32> to memref<128xf32, strided<[1], offset: ?>>
    scf.forall (%arg2) in (4) {
//       CHECK:     %[[TIDX:.*]] = gpu.thread_id  x
//       CHECK:     %[[TIDY:.*]] = gpu.thread_id  y
//       CHECK:     %[[TIDZ:.*]] = gpu.thread_id  z
//       CHECK:     %[[THX:.*]] = affine.apply #[[$MAPW]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]]
//   CHECK-NOT:     scf.if
//       CHECK:       memref.subview %{{.*}}[%[[THX]]]
      %1 = affine.apply #map1(%arg2)
      %subview_0 = memref.subview %subview[%1] [32] [1] : memref<128xf32, strided<[1], offset: ?>> to memref<32xf32, strided<[1], offset: ?>>
      vector.transfer_write %cst, %subview_0[%c0] {in_bounds = [true]} : vector<32xf32>, memref<32xf32, strided<[1], offset: ?>>
      memref.copy %subview_0, %subview_0 : memref<32xf32, strided<[1], offset: ?>> to memref<32xf32, strided<[1], offset: ?>>
    } {mapping = [#gpu.warp<linear_dim_0>]}
    memref.copy %subview, %subview : memref<128xf32, strided<[1], offset: ?>> to memref<128xf32, strided<[1], offset: ?>>
  } {mapping = [#gpu.block<x>]}
  return %arg0 : memref<128xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
    %func = transform.structured.match ops{["func.func"]} in %module_op
      : (!transform.any_op) -> !transform.any_op
    %gpu_launch = transform.gpu.map_forall_to_blocks %func generate_gpu_launch
      : (!transform.any_op) -> !transform.any_op
    transform.gpu.map_nested_forall_to_threads %gpu_launch block_dims = [4, 8, 4]
      : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}