// RUN: mlir-opt -transform-interpreter -split-input-file -verify-diagnostics -allow-unregistered-dialect %s | FileCheck %s


!tt = tensor<8xf16>

// CHECK-LABEL: func @copy_1d_8xf16
func.func @copy_1d_8xf16(%t0: !tt, %out: !tt) -> !tt {
  /// Too little data for all threads; needs predication while keeping the most
  /// minor transfer size -> 1 thread.
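  /// Arithmetic: f16 is 16 bits, so a 128-bit-aligned transfer moves 128/16 = 8
  /// elements per thread; 8 total elements / 8 = 1 thread.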
  // CHECK: scf.forall {{.*}} in (1) {{.*}}
  // CHECK:   linalg.copy {{.*}} -> tensor<8xf16>
  // CHECK: {mapping = [#gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 128
        : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<8xf16>
!tin = tensor<?xf16>

// CHECK-LABEL: func @pad_1d_8xf16
func.func @pad_1d_8xf16(%t0: !tin, %sz: index) -> !tt {
  %cst = arith.constant 0.0 : f16
  /// Too little data for all threads; needs predication while keeping the most
  /// minor transfer size -> 1 thread.
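  /// Same arithmetic as the plain copy: 128/16 = 8 elements per thread, so
  /// 8 elements -> 1 thread; the pad is emitted on a dynamically sized tile
  /// (tensor<?xf16>) and cast back to the static type.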
  // CHECK: scf.forall {{.*}} in (1) {{.*}}
  // CHECK:   %[[padded:.*]] = tensor.pad {{.*}}
  // CHECK:   tensor.cast %[[padded]] : tensor<?xf16> to tensor<8xf16>
  // CHECK: {mapping = [#gpu.thread<linear_dim_0>]}
  %0 = tensor.pad %t0 low[0] high[%sz] {
  ^bb0(%arg0: index):
    tensor.yield %cst : f16
  } : !tin to !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["tensor.pad"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 128
        : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"tensor.pad">)
    transform.yield
  }
}

// -----

!tt = tensor<16xf16>

// CHECK-LABEL: func @copy_1d_16xf16
func.func @copy_1d_16xf16(%t0: !tt, %out: !tt) -> !tt {
  /// Too little data for all threads; needs predication while keeping the most
  /// minor transfer size -> 2 threads.
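  /// Arithmetic: 128/16 = 8 elements per thread; 16 total elements / 8 = 2
  /// threads.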
  // CHECK: scf.forall {{.*}} in (2) {{.*}}
  // CHECK:   linalg.copy {{.*}} -> tensor<8xf16>
  // CHECK: {mapping = [#gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 128
        : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<20xf16>

// CHECK-LABEL: func @copy_1d_20xf16
func.func @copy_1d_20xf16(%t0: !tt, %out: !tt) -> !tt {
  /// Too little data for all threads; needs predication while keeping the most
  /// minor transfer size -> 5 threads.
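  /// Arithmetic: 20 is not divisible by the maximal 8-element transfer, so the
  /// transfer shrinks to 4xf16 (still 64-bit wide); 20/4 = 5 threads.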
  // CHECK: scf.forall {{.*}} in (5) {{.*}}
  // CHECK:   linalg.copy {{.*}} -> tensor<4xf16>
  // CHECK: {mapping = [#gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 128
        : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<128xf16>

// CHECK-LABEL: func @copy_1d_128xf16
func.func @copy_1d_128xf16(%t0: !tt, %out: !tt) -> !tt {
  /// Enough data for all threads and no need for predication, but we must
  /// reduce the transfer size to 4xf16.
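  /// Arithmetic: 32 threads * 8 elements would cover 256 > 128, so each
  /// thread's transfer shrinks to 128/32 = 4 elements.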
  // CHECK: scf.forall {{.*}} in (32) {{.*}}
  // CHECK:   linalg.copy {{.*}} -> tensor<4xf16>
  // CHECK: {mapping = [#gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 128
        : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<256xf16>

// CHECK-LABEL: func @copy_1d_256xf16
func.func @copy_1d_256xf16(%t0: !tt, %out: !tt) -> !tt {
  /// Enough data for all threads and no need for predication.
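  /// Arithmetic: 256 elements = 32 threads * 8 elements per 128-bit transfer,
  /// an exact fit.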
  // CHECK: scf.forall {{.*}} in (32) {{.*}}
  // CHECK:   linalg.copy {{.*}} -> tensor<8xf16>
  // CHECK: {mapping = [#gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 128
        : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<16x32x64xi8>

// CHECK-LABEL: func @copy_3d_16x32x64xi8
func.func @copy_3d_16x32x64xi8(%t0: !tt, %out: !tt) -> !tt {
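  // Expected arithmetic: 128/8 = 16 i8 elements per transfer -> 64/16 = 4
  // threads on the minor dim; the remaining 32/4 = 8 threads map to the middle
  // dim (tile 32/8 = 4) and the outermost dim keeps a single thread.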
  // CHECK: scf.forall {{.*}} in (1, 8, 4) {{.*}}
  // CHECK:   linalg.copy {{.*}} -> tensor<16x4x16xi8>
  // CHECK: {mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 128
        : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<16x32x64xi8>

// CHECK-LABEL: func @copy_3d_16x32x64xi8
func.func @copy_3d_16x32x64xi8(%t0: !tt, %out: !tt) -> !tt {
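  // Expected arithmetic: 64-bit alignment gives 64/8 = 8 elements per transfer
  // -> 8 threads on the minor dim; 32/8 = 4 threads on the middle dim (tile 8).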
  // CHECK: scf.forall {{.*}} in (1, 4, 8) {{.*}}
  // CHECK:   linalg.copy {{.*}} -> tensor<16x8x8xi8>
  // CHECK: {mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 64
        : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<4x8x16xi8>

// CHECK-LABEL: func @copy_3d_4x8x16xi8
func.func @copy_3d_4x8x16xi8(%t0: !tt, %out: !tt) -> !tt {
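  // Expected arithmetic: the 16-element minor dim fits a single 128-bit
  // transfer -> 1 thread there; the remaining 32 threads split as 8 on the
  // middle dim and 4 on the outermost, leaving 1x1x16 tiles.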
  // CHECK: scf.forall {{.*}} in (4, 8, 1) {{.*}}
  // CHECK:   linalg.copy {{.*}} -> tensor<1x1x16xi8>
  // CHECK: {mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 128
        : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<4x8x16xi8>

// CHECK-LABEL: func @copy_3d_4x8x16xi8
func.func @copy_3d_4x8x16xi8(%t0: !tt, %out: !tt) -> !tt {
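  // Expected arithmetic: 8-bit alignment forces single-element transfers
  // -> 16 threads on the minor dim; 32/16 = 2 threads on the middle dim
  // (tile 4) and 1 thread on the outermost (tile 4).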
  // CHECK: scf.forall {{.*}} in (1, 2, 16) {{.*}}
  // CHECK:   linalg.copy {{.*}} -> tensor<4x4x1xi8>
  // CHECK: {mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 8
        : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<3x5x7xi8>

// CHECK-LABEL: func @copy_3d_3x5x7xi8
func.func @copy_3d_3x5x7xi8(%t0: !tt, %out: !tt) -> !tt {
  // Best effort greedy mapping: first 7, then skip 5 (as 7*5 overflows 32), then
  // take 3.
  // DP mapping: 7 mandated most minor, then skip 5 (as 7*5 overflows 32), then
  // take 3.
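  // Worked out: 7 threads on the minor dim; 7*5 = 35 > 32, so the middle dim
  // stays sequential (tile 5); 7*3 = 21 <= 32, so the outermost takes 3.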
  // CHECK: scf.forall {{.*}} in (3, 1, 7) {{.*}}
  // CHECK:   linalg.copy {{.*}} -> tensor<1x5x1xi8>
  // CHECK: {mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 8
        : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<16x15x5xi8>

// CHECK-LABEL: func @copy_3d_16x15x5xi8
func.func @copy_3d_16x15x5xi8(%t0: !tt, %out: !tt) -> !tt {
  // DP mapping: 5 mandated most minor, then 3 to allow 8 on the outermost.
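  // Greedy would take all of 15 next (5*15 = 75 threads), leaving only 1 thread
  // for the outermost dim; splitting 15 as 3 instead leaves 128/(5*3) -> 8
  // threads for the outermost dim (tile 16/8 = 2).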
  // CHECK: scf.forall {{.*}} in (8, 3, 5) {{.*}}
  // CHECK:   linalg.copy {{.*}} -> tensor<2x5x1xi8>
  // CHECK: {mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 128 desired_bit_alignment = 8
        : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<16x15x40xi8>

// CHECK-LABEL: func @copy_3d_16x15x40xi8
func.func @copy_3d_16x15x40xi8(%t0: !tt, %out: !tt) -> !tt {
  // DP mapping: 5 mandated most minor, then 3 to allow 8 on the outermost.
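  // Same thread split as above; the minor tile is now 40/5 = 8 elements, i.e.
  // one 64-bit transfer of i8 per thread.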
  // CHECK: scf.forall {{.*}} in (8, 3, 5) {{.*}}
  // CHECK:   linalg.copy {{.*}} -> tensor<2x5x8xi8>
  // CHECK: {mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 128 desired_bit_alignment = 64
        : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}


////////////////////////////////////////////////////////////////////////////////
// Tests below are expected to fail.
////////////////////////////////////////////////////////////////////////////////

// -----

!tt = tensor<1024xf16>

// NO-CHECK-LABEL-ON-EXPECTED-ERROR
func.func @copy_1d_1024xf16(%t0: !tt, %out: !tt) -> !tt {
  /// Too much data for all threads; we do not try to recover here. Selecting
  /// better tile sizes and numbers of threads is the job of higher-level
  /// transformations.
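  /// (32 threads * 8 f16 elements per 128-bit transfer = 256 < 1024.)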

  // expected-note @below {{target op}}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{too few threads to map copy op to threads on the most minor dimension, given alignment and vector size constraints}}
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 128
        : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<257xf16>

// NO-CHECK-LABEL-ON-EXPECTED-ERROR
func.func @copy_1d_257xf16(%t0: !tt, %out: !tt) -> !tt {
  /// Too much data for all threads; we do not try to recover here. Selecting
  /// better tile sizes and numbers of threads is the job of higher-level
  /// transformations.
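  /// (257 is prime; only 1-element transfers divide it evenly, and that would
  /// require 257 threads > 32.)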

  // expected-note @below {{target op}}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{too few threads to map copy op to threads on the most minor dimension, given alignment and vector size constraints}}
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 128
        : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<512xi8>

// NO-CHECK-LABEL-ON-EXPECTED-ERROR
func.func @copy_1d_512xi8(%t0: !tt, %out: !tt) -> !tt {
  /// Too much data for all threads given the forced 8-bit alignment; we do not
  /// try to recover here. Selecting better tile sizes and numbers of threads is
  /// the job of higher-level transformations.
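  /// (Single-element transfers at 8-bit alignment would require 512 threads
  /// > 32.)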
  // expected-note @below {{target op}}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{too few threads to map copy op to threads on the most minor dimension, given alignment and vector size constraints}}
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 8
        : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}

// -----

!tt = tensor<16x32x64xi8>

// NO-CHECK-LABEL-ON-EXPECTED-ERROR
func.func @copy_3d_16x32x64xi8(%t0: !tt, %out: !tt) -> !tt {
  /// Too much data for all threads given the forced 8-bit alignment; we do not
  /// try to recover here. Selecting better tile sizes and numbers of threads is
  /// the job of higher-level transformations.
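  /// (The 64-element minor dim alone would need 64 single-byte transfers, i.e.
  /// 64 threads > 32.)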
  // expected-note @below {{target op}}
  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt
  return %0 : !tt
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    // expected-error @below {{too few threads to map copy op to threads on the most minor dimension, given alignment and vector size constraints}}
    transform.structured.gpu.map_copy_to_threads %0
      total_num_threads = 32 desired_bit_alignment = 8
        : (!transform.any_op) -> (!transform.op<"scf.forall">, !transform.op<"linalg.copy">)
    transform.yield
  }
}