// xref: /llvm-project/mlir/test/Dialect/Linalg/tile-to-forall.mlir (revision 6740d701bde4ad9b95d7d811852fa0a2542e6b28)
// RUN: mlir-opt %s --transform-interpreter -canonicalize -cse -split-input-file -verify-diagnostics | FileCheck %s

// Offset per thread:
// CHECK-DAG: affine_map<(d0)[s0] -> (d0 * (s0 ceildiv 10))>
// Per thread tile size.
// CHECK-DAG: affine_map<(d0)[s0] -> (-(d0 * (s0 ceildiv 10)) + s0, s0 ceildiv 10)>
// CHECK-DAG: affine_map<(d0)[s0] -> (d0 * (s0 ceildiv 20))>
// CHECK-DAG: affine_map<(d0)[s0] -> (-(d0 * (s0 ceildiv 20)) + s0, s0 ceildiv 20)>
module {
// CHECK-LABEL: matmul(
//  CHECK-SAME:   %[[A:[0-9a-z]+]]: tensor<?x?xf32>
//  CHECK-SAME:   %[[B:[0-9a-z]+]]: tensor<?x?xf32>
//  CHECK-SAME:   %[[C:[0-9a-z]+]]: tensor<?x?xf32>
  func.func @matmul(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
  //      CHECK: scf.forall ({{.*}}) in (10, 20) shared_outs(%[[C_BLK:.*]] = %[[C]]) -> (tensor<?x?xf32>) {
  //      CHECK:   %[[tA:.*]] = tensor.extract_slice %[[A]]{{.*}} : tensor<?x?xf32> to tensor<?x?xf32>
  //      CHECK:   %[[tB:.*]] = tensor.extract_slice %[[B]]{{.*}} : tensor<?x?xf32> to tensor<?x?xf32>
  //      CHECK:   %[[tC:.*]] = tensor.extract_slice %[[C_BLK]]{{.*}} : tensor<?x?xf32> to tensor<?x?xf32>
  //      CHECK:   %[[RES:.*]] = linalg.matmul
  // CHECK-SAME:      ins(%[[tA]], %[[tB]] : tensor<?x?xf32>, tensor<?x?xf32>)
  // CHECK-SAME:     outs(%[[tC]] : tensor<?x?xf32>) -> tensor<?x?xf32>
  //      CHECK:   scf.forall.in_parallel {
  // CHECK-NEXT:     tensor.parallel_insert_slice %[[RES]] into %[[C_BLK]]{{.*}} :
  // CHECK-SAME:       tensor<?x?xf32> into tensor<?x?xf32>
  // CHECK-NEXT:   }
  // CHECK-NEXT: } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
                      outs(%C : tensor<?x?xf32>) -> (tensor<?x?xf32>)
    return %0 : tensor<?x?xf32>
  }

  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
      %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
      %1:2 = transform.structured.tile_using_forall %0 num_threads [10, 20] (mapping = [ #gpu.thread<y>, #gpu.thread<x> ] )
           : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
     transform.yield
    }
  }
}

// -----

module {
  // Tiling a matmul on memrefs: the forall operates in place, so there are no
  // shared_outs and the body only takes subviews.
  // CHECK-LABEL: func @matmul_memref(
  //       CHECK:   scf.forall (%{{.*}}, %{{.*}}) in (10, 20) {
  //       CHECK:     memref.subview
  //       CHECK:     memref.subview
  //       CHECK:     memref.subview
  //       CHECK:     linalg.matmul
  //       CHECK:   } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
  func.func @matmul_memref(%A: memref<?x?xf32>, %B: memref<?x?xf32>, %C: memref<?x?xf32>) {
    linalg.matmul ins(%A, %B : memref<?x?xf32>, memref<?x?xf32>)
                  outs(%C : memref<?x?xf32>)
    return
  }

  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
      %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
      %1:2 = transform.structured.tile_using_forall %0 num_threads [10, 20] (mapping = [ #gpu.thread<y>, #gpu.thread<x> ] )
           : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
      transform.yield
    }
  }
}

// -----

module {
  // Tiling linalg.copy on memrefs with num_threads.
  // CHECK-LABEL: func @copy_memref(
  //       CHECK:   scf.forall (%{{.*}}, %{{.*}}) in (10, 20) {
  //       CHECK:     memref.subview
  //       CHECK:     memref.subview
  //       CHECK:     linalg.copy
  //       CHECK:   } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
  func.func @copy_memref(%A: memref<?x?xf32>, %B: memref<?x?xf32>) {
    linalg.copy ins(%A: memref<?x?xf32>)
                outs(%B : memref<?x?xf32>)
    return
  }

  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
      %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
      %1:2 = transform.structured.tile_using_forall %0 num_threads [10, 20] (mapping = [ #gpu.thread<y>, #gpu.thread<x> ] )
           : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
      transform.yield
    }
  }
}

// -----

// In this test case, matmul dims and tile size are dynamic.

// CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)>
// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0, s1] -> (-(d0 * s1) + s0, s1)>
// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0)[s0] -> (d0 * s0)>

// CHECK-LABEL: matmul_tile_size_dynamic_dynamic(
//  CHECK-SAME:   %[[A:[0-9a-z]+]]: tensor<?x?xf32>
//  CHECK-SAME:   %[[B:[0-9a-z]+]]: tensor<?x?xf32>
//  CHECK-SAME:   %[[C:[0-9a-z]+]]: tensor<?x?xf32>
func.func @matmul_tile_size_dynamic_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
  //  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
  //  CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
  //  CHECK-DAG: %[[tile_size_1:.*]] = "test.dummy"()
  //  CHECK-DAG: %[[tile_size_2:.*]] = "test.dummy"()
  //  CHECK-DAG: %[[M:.+]] = tensor.dim %[[A]], %[[c0]] :
  //  CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %c1 :
  //  CHECK-DAG: %[[NT0:.+]] = affine.apply #[[$map0]]()[%[[M]], %[[tile_size_1]]]
  //  CHECK-DAG: %[[NT1:.+]] = affine.apply #[[$map0]]()[%[[N]], %[[tile_size_2]]]
  //      CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
  //      CHECK:   tensor.extract_slice %[[A]]
  //      CHECK:   tensor.extract_slice %[[B]]
  //      CHECK:   tensor.extract_slice %[[C_BLK]]
  //      CHECK:   linalg.matmul
  //      CHECK:   scf.forall.in_parallel
  // CHECK-NEXT:    tensor.parallel_insert_slice
  %tile_size_1 = "test.dummy"() : () -> (index)
  %tile_size_2 = "test.dummy"() : () -> (index)
  %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
                    outs(%C : tensor<?x?xf32>) -> (tensor<?x?xf32>)
  return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    // Both tile sizes come from one handle (two test.dummy payload ops),
    // unpacked with the *(...) syntax.
    %sz = transform.structured.match ops{["test.dummy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %1:2 = transform.structured.tile_using_forall %0 tile_sizes *(%sz)
           : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}

// -----

// Tests that dimension 0 can eliminate affine.min/max, dimension 1 cannot.

// CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * -15 + 300, 15)>
// CHECK-DAG: #[[$map1:.+]] = affine_map<(d0) -> (0, d0)>
// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0) -> (d0 * 10)>
// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0) -> (d0 * 15)>

// CHECK-LABEL: matmul_static(
//  CHECK-SAME:   %[[A:[0-9a-z]+]]: tensor
//  CHECK-SAME:   %[[B:[0-9a-z]+]]: tensor
//  CHECK-SAME:   %[[C:[0-9a-z]+]]: tensor
func.func @matmul_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf32>, %C: tensor<100x300xf32>) -> tensor<100x300xf32> {
  //      CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (10, 21) shared_outs(%[[C_BLK:.*]] = %[[C]])
  //      CHECK:   %[[TSMIN:.+]] = affine.min #[[$map0]](%[[IV1]])
  //      CHECK:   %[[TS:.+]] = affine.max #[[$map1]](%[[TSMIN]])
  //  CHECK-NOT:   affine.min
  //  CHECK-NOT:   affine.max
  //      CHECK:   %[[LB0:.+]] = affine.apply #[[$map2]](%[[IV0]])
  //      CHECK:   %[[LB1:.+]] = affine.apply #[[$map3]](%[[IV1]])
  //      CHECK:   %[[tA:.+]] = tensor.extract_slice %[[A]][%[[LB0]], 0] [10, 200] [1, 1] :
  //      CHECK:   %[[tB:.+]] = tensor.extract_slice %[[B]][0, %[[LB1]]] [200, %[[TS]]] [1, 1] :
  //      CHECK:   %[[tC:.+]] = tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [10, %[[TS]]] [1, 1] :
  //      CHECK:   linalg.matmul
  //      CHECK:   scf.forall.in_parallel
  // CHECK-NEXT:    tensor.parallel_insert_slice
  %0 = linalg.matmul ins(%A, %B : tensor<100x200xf32>, tensor<200x300xf32>)
                    outs(%C : tensor<100x300xf32>) -> (tensor<100x300xf32>)
  return %0 : tensor<100x300xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %1:2 = transform.structured.tile_using_forall %0 num_threads [10, 21]
           : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}

// -----

// CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0] -> (s0 ceildiv 10)>
// CHECK-DAG: #[[$map1:.+]] = affine_map<()[s0] -> (s0 ceildiv 20)>
// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0] -> (d0 * -10 + s0, 10)>
// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)>
// CHECK-DAG: #[[$map5:.+]] = affine_map<(d0) -> (d0 * 10)>
// CHECK-DAG: #[[$map6:.+]] = affine_map<(d0) -> (d0 * 20)>

// CHECK-LABEL: matmul_tile_size_dynamic(
//  CHECK-SAME:   %[[A:[0-9a-z]+]]: tensor<?x?xf32>
//  CHECK-SAME:   %[[B:[0-9a-z]+]]: tensor<?x?xf32>
//  CHECK-SAME:   %[[C:[0-9a-z]+]]: tensor<?x?xf32>
func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
  //      CHECK: %[[M:.+]] = tensor.dim %[[A]], %c0 :
  //      CHECK: %[[N:.+]] = tensor.dim %[[B]], %c1 :
  //      CHECK: %[[NT0:.+]] = affine.apply #[[$map0]]()[%[[M]]]
  //      CHECK: %[[NT1:.+]] = affine.apply #[[$map1]]()[%[[N]]]
  //      CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
  //  CHECK-DAG:   %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]]
  //  CHECK-DAG:   %[[TS1:.+]] = affine.min #[[$map4]](%[[IV1]])[%[[N]]]
  //  CHECK-DAG:   %[[LB0:.+]] = affine.apply #[[$map5]](%[[IV0]])
  //  CHECK-DAG:   %[[LB1:.+]] = affine.apply #[[$map6]](%[[IV1]])
  //      CHECK:   tensor.extract_slice %[[A]]
  //      CHECK:   tensor.extract_slice %[[B]]
  //      CHECK:   tensor.extract_slice %[[C_BLK]]
  //      CHECK:   linalg.matmul
  //      CHECK:   scf.forall.in_parallel
  // CHECK-NEXT:    tensor.parallel_insert_slice
  %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
                    outs(%C : tensor<?x?xf32>) -> (tensor<?x?xf32>)
  return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %1:2 = transform.structured.tile_using_forall %0 tile_sizes [10, 20]
           : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}
// -----

// Tests that dimension 0 can eliminate affine.min/max, dimension 1 cannot.

// CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * -21 + 300, 21)>
// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0) -> (d0 * 10)>
// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0) -> (d0 * 21)>

// CHECK-LABEL: matmul_tile_size_static(
//  CHECK-SAME:   %[[A:[0-9a-z]+]]: tensor
//  CHECK-SAME:   %[[B:[0-9a-z]+]]: tensor
//  CHECK-SAME:   %[[C:[0-9a-z]+]]: tensor
func.func @matmul_tile_size_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf32>, %C: tensor<100x300xf32>) -> tensor<100x300xf32> {
  //      CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (10, 15) shared_outs(%[[C_BLK:.*]] = %[[C]])
  //  CHECK-DAG:   %[[TS:.+]] = affine.min #[[$map0]](%[[IV1]])
  //  CHECK-DAG:   %[[LB0:.+]] = affine.apply #[[$map2]](%[[IV0]])
  //  CHECK-DAG:   %[[LB1:.+]] = affine.apply #[[$map3]](%[[IV1]])
  //  CHECK-NOT:   affine.max
  //  CHECK-NOT:   affine.min
  //      CHECK:   %[[tA:.+]] = tensor.extract_slice %[[A]][%[[LB0]], 0] [10, 200] [1, 1] :
  //      CHECK:   %[[tB:.+]] = tensor.extract_slice %[[B]][0, %[[LB1]]] [200, %[[TS]]] [1, 1] :
  //      CHECK:   %[[tC:.+]] = tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [10, %[[TS]]] [1, 1] :
  //      CHECK:   linalg.matmul
  //      CHECK:   scf.forall.in_parallel
  // CHECK-NEXT:    tensor.parallel_insert_slice
  %0 = linalg.matmul ins(%A, %B : tensor<100x200xf32>, tensor<200x300xf32>)
                    outs(%C : tensor<100x300xf32>) -> (tensor<100x300xf32>)
  return %0 : tensor<100x300xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %1:2 = transform.structured.tile_using_forall %0 tile_sizes [10, 21]
           : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}

// -----

module {
  // Tiling a generic whose output is itself an extract_slice of a larger
  // tensor: the parallel_insert_slice must target the sliced destination.
  func.func @extract_source(%A: tensor<4xf32>, %B: tensor<16xf32>) -> tensor<4xf32> {
    %B1 = tensor.extract_slice %B[10] [4] [1] : tensor<16xf32> to tensor<4xf32>
    %result = linalg.generic {indexing_maps = [
      affine_map<(d0) -> (d0)>,affine_map<(d0) -> (d0)>],
      iterator_types = ["parallel"]}
      ins(%A : tensor<4xf32>) outs(%B1 : tensor<4xf32>) {
      ^bb0(%arg3: f32, %arg4: f32):  // no predecessors
        %2 = arith.addf %arg3, %arg3 : f32
        linalg.yield %2 : f32
    } -> tensor<4xf32>
    return %result : tensor<4xf32>
  }

  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
      %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
      %1:2 = transform.structured.tile_using_forall %0 num_threads [2] ( mapping = [#gpu.thread<x>])
           : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
      transform.yield
    }
  }
}
// CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * 2)>

// CHECK-LABEL: extract_source(
//       CHECK:  scf.forall (%[[ARG:.*]]) in (2) shared_outs(%{{.*}} = %{{.*}}) -> (tensor<4xf32>) {
//       CHECK:    %[[OFF:.*]] = affine.apply #[[$map0]](%[[ARG]])
//       CHECK:    scf.forall.in_parallel {
//       CHECK:      tensor.parallel_insert_slice %{{.*}} into %{{.*}}[%[[OFF]]] [2] [1] : tensor<2xf32> into tensor<4xf32>

// -----


// In this test case, matmul dims and tile size are dynamic.

// CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)>
// CHECK-DAG: #[[$map1:.+]] = affine_map<()[s0] -> (s0 ceildiv 20)>
// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0, s1] -> (-(d0 * s1) + s0, s1)>
// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)>
// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0)[s0] -> (d0 * s0)>
// CHECK-DAG: #[[$map5:.+]] = affine_map<(d0) -> (d0 * 20)>

// CHECK-LABEL: matmul_tile_size_dynamic_dynamic(
//  CHECK-SAME:   %[[A:[0-9a-z]+]]: tensor<?x?xf32>
//  CHECK-SAME:   %[[B:[0-9a-z]+]]: tensor<?x?xf32>
//  CHECK-SAME:   %[[C:[0-9a-z]+]]: tensor<?x?xf32>
func.func @matmul_tile_size_dynamic_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
  //  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
  //  CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
  //  CHECK-DAG: %[[tile_size:.*]] = "test.dummy"()
  //  CHECK-DAG: %[[M:.+]] = tensor.dim %[[A]], %[[c0]] :
  //  CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %c1 :
  //  CHECK-DAG: %[[NT0:.+]] = affine.apply #[[$map0]]()[%[[M]], %[[tile_size]]]
  //  CHECK-DAG: %[[NT1:.+]] = affine.apply #[[$map1]]()[%[[N]]]
  //      CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
  //      CHECK:   tensor.extract_slice %[[A]]
  //      CHECK:   tensor.extract_slice %[[B]]
  //      CHECK:   tensor.extract_slice %[[C_BLK]]
  //      CHECK:   linalg.matmul
  //      CHECK:   scf.forall.in_parallel
  // CHECK-NEXT:    tensor.parallel_insert_slice
  %tile_size = "test.dummy"() : () -> (index)
  %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
                    outs(%C : tensor<?x?xf32>) -> (tensor<?x?xf32>)
  return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    // Mixed tile sizes: a dynamic size from a payload-op handle and a static 20.
    %sz = transform.structured.match ops{["test.dummy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %1:2 = transform.structured.tile_using_forall %0 tile_sizes [%sz, 20]
           : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}

// -----


// Tiling a generic with two outputs along its single parallel dimension.

// CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * -15 + 100, 15)>
// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0) -> (d0 * 15)>
// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0) -> (d0)>

// CHECK-LABEL: tile_output_multi_1d_static(
//  CHECK-SAME:   %[[IN1:[0-9a-z]+]]: tensor<100xf32>
//  CHECK-SAME:   %[[IN2:[0-9a-z]+]]: tensor<100xf32>
//  CHECK-SAME:   %[[ORGOUT1:[0-9a-z]+]]: tensor<100xf32>
//  CHECK-SAME:   %[[ORGOUT2:[0-9a-z]+]]: tensor<100xf32>
  func.func @tile_output_multi_1d_static(%IN1: tensor<100xf32>, %IN2: tensor<100xf32>,
                                         %OUT1: tensor<100xf32>, %OUT2: tensor<100xf32>)
                                         -> (tensor<100xf32>, tensor<100xf32>) {
//      CHECK: scf.forall (%[[IV0:.+]]) in (7) shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]])
//      CHECK:   %[[TS:.+]] = affine.min #[[$map0]](%[[IV0]])
//  CHECK-NOT:   affine.min
//  CHECK-NOT:   affine.max
//      CHECK:   %[[LB:.+]] = affine.apply #[[$map2]](%[[IV0]])
//      CHECK:   %[[tIN1:.+]] = tensor.extract_slice %[[IN1]][%[[LB]]] [%[[TS]]] [1] :
//      CHECK:   %[[tIN2:.+]] = tensor.extract_slice %[[IN2]][%[[LB]]] [%[[TS]]] [1] :
//      CHECK:   %[[tOUT1:.+]] = tensor.extract_slice %[[OUT1]][%[[LB]]] [%[[TS]]] [1] :
//      CHECK:   %[[tOUT2:.+]] = tensor.extract_slice %[[OUT2]][%[[LB]]] [%[[TS]]] [1] :
//      CHECK:   %[[RES1:[0-9]+]]:[[RES2:[0-9]+]] = linalg.generic
//      CHECK:   scf.forall.in_parallel
// CHECK-NEXT:    tensor.parallel_insert_slice %[[RES1]]#0 into %[[OUT1]][%[[LB]]] [%[[TS]]] [1] :
// CHECK-NEXT:    tensor.parallel_insert_slice %[[RES1]]#1 into %[[OUT2]][%[[LB]]] [%[[TS]]] [1] :
    %res1, %res2 = linalg.generic
    {
      indexing_maps = [affine_map<(d0) -> (d0)>,
                       affine_map<(d0) -> (d0)>,
                       affine_map<(d0) -> (d0)>,
                       affine_map<(d0) -> (d0)>],
      iterator_types = ["parallel"]
    } ins(%IN1, %IN2 : tensor<100xf32>, tensor<100xf32>)
      outs(%OUT1, %OUT2 : tensor<100xf32>, tensor<100xf32>)
    {
      ^bb0(%a1: f32, %a2: f32, %a3: f32, %a4: f32):
        %1 = arith.addf %a1, %a3 : f32
        %2 = arith.addf %a2, %a4 : f32
        linalg.yield %1, %2 : f32,f32
    } -> (tensor<100xf32>, tensor<100xf32>)
    return %res1, %res2 : tensor<100xf32>, tensor<100xf32>
  }

  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
      %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
      %tiled_generic, %forall = transform.structured.tile_using_forall %0 num_threads [7]
           : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
      transform.yield
    }
  }

// -----


// Tiling a generic with a 1-D and a 2-D output (transposed indexing on the
// first output).

// CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * 75)>
// CHECK-DAG: #[[$map1:.+]] = affine_map<(d0, d1) -> (d1)>
// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0, d1) -> (d1, d0)>
// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0, d1) -> (d0)>
// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0, d1) -> (d0, d1)>

// CHECK-LABEL: tile_output_multi_1d2d_static(
//  CHECK-SAME:   %[[IN1:[0-9a-z]+]]: tensor<100xf32>
//  CHECK-SAME:   %[[IN2:[0-9a-z]+]]: tensor<100x300xf32>
//  CHECK-SAME:   %[[IN3:[0-9a-z]+]]: tensor<300xf32>
//  CHECK-SAME:   %[[ORGOUT1:[0-9a-z]+]]: tensor<300x100xf32>
//  CHECK-SAME:   %[[ORGOUT2:[0-9a-z]+]]: tensor<300xf32>
  func.func @tile_output_multi_1d2d_static(%IN1: tensor<100xf32>, %IN2: tensor<100x300xf32>, %IN3: tensor<300xf32>,
                     %OUT1: tensor<300x100xf32>, %OUT2: tensor<300xf32>)
                     -> (tensor<300x100xf32>, tensor<300xf32>) {
//      CHECK: scf.forall (%[[IV0:.+]]) in (4) shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]])
//      CHECK:   %[[LB:.+]] = affine.apply #[[$map0]](%[[IV0]])
//      CHECK:   %[[tIN1:.+]] = tensor.extract_slice %[[IN2]][0, %[[LB]]] [100, 75]
//      CHECK:   %[[tIN2:.+]] = tensor.extract_slice %[[IN3]][%[[LB]]] [75]
//      CHECK:   %[[tOUT1:.+]] = tensor.extract_slice %[[OUT1]][%[[LB]], 0] [75, 100]
//      CHECK:   %[[tOUT2:.+]] = tensor.extract_slice %[[OUT2]][%[[LB]]] [75]
//      CHECK:   %[[RES1:[0-9]+]]:[[RES2:[0-9]+]] = linalg.generic
//      CHECK:   scf.forall.in_parallel
// CHECK-NEXT:    tensor.parallel_insert_slice %[[RES1]]#0 into %[[OUT1]][%[[LB]], 0] [75, 100]
// CHECK-NEXT:    tensor.parallel_insert_slice %[[RES1]]#1 into %[[OUT2]][%[[LB]]] [75]
    %res2, %res3 = linalg.generic {
      indexing_maps = [affine_map<(d0,d1) -> (d1)>,
                       affine_map<(d0,d1) -> (d1,d0)>,
                       affine_map<(d0,d1) -> (d0)>,
                       affine_map<(d0,d1) -> (d0,d1)>,
                       affine_map<(d0,d1) -> (d0)>
                       ],
      iterator_types = ["parallel", "parallel"]
    } ins(%IN1, %IN2, %IN3 : tensor<100xf32>, tensor<100x300xf32>, tensor<300xf32>)
      outs(%OUT1, %OUT2: tensor<300x100xf32>, tensor<300xf32>)  {
      ^bb0(%i1: f32, %i2: f32, %i3: f32, %o1: f32, %o2: f32):
        %1 = arith.addf %i1, %o1 : f32
        %2 = arith.addf %i2, %1 : f32
        %3 = arith.addf %i3, %2 : f32
        linalg.yield %3, %i3 : f32, f32
    } -> (tensor<300x100xf32>, tensor<300xf32>)

    return %res2, %res3 : tensor<300x100xf32>, tensor<300xf32>
  }

  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%IN_MAT2: !transform.any_op {transform.readonly}) {
      %0 = transform.structured.match ops{["linalg.generic"]} in %IN_MAT2 : (!transform.any_op) -> !transform.any_op
      %tiled_generic, %forall = transform.structured.tile_using_forall %0 num_threads [4]
           : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
      transform.yield
    }
  }

// -----


// The first tile size is provided as a transform parameter (constant 10).

// CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0] -> (s0 ceildiv 10)>
// CHECK-DAG: #[[$map1:.+]] = affine_map<()[s0] -> (s0 ceildiv 20)>
// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0] -> (d0 * -10 + s0, 10)>
// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)>
// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0) -> (d0 * 10)>
// CHECK-DAG: #[[$map5:.+]] = affine_map<(d0) -> (d0 * 20)>

// CHECK-LABEL: matmul_tile_size_dynamic(
//  CHECK-SAME:   %[[A:[0-9a-z]+]]: tensor<?x?xf32>
//  CHECK-SAME:   %[[B:[0-9a-z]+]]: tensor<?x?xf32>
//  CHECK-SAME:   %[[C:[0-9a-z]+]]: tensor<?x?xf32>
func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
  //      CHECK: %[[c1:.*]] = arith.constant 1 : index
  //      CHECK: %[[c0:.*]] = arith.constant 0 : index
  //  CHECK-DAG: %[[M:.+]] = tensor.dim %[[A]], %[[c0]] :
  //  CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %[[c1]] :
  //  CHECK-DAG: %[[NT0:.+]] = affine.apply #map()[%[[M]]]
  //  CHECK-DAG: %[[NT1:.+]] = affine.apply #map1()[%[[N]]]
  //  CHECK-DAG: %[[K:.+]] = tensor.dim %[[A]], %[[c1]] :
  //      CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
  //  CHECK-DAG:   %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]]
  //  CHECK-DAG:   %[[TS1:.+]] = affine.min #[[$map3]](%[[IV1]])[%[[N]]]
  //  CHECK-DAG:   %[[LB0:.+]] = affine.apply #[[$map4]](%[[IV0]])
  //  CHECK-DAG:   %[[LB1:.+]] = affine.apply #[[$map5]](%[[IV1]])
  //      CHECK:   tensor.extract_slice %[[A]][%[[LB0]], 0] [%[[TS0]], %[[K]]] [1, 1] :
  //      CHECK:   tensor.extract_slice %[[B]][0, %[[LB1]]] [%[[K]], %[[TS1]]] [1, 1] :
  //      CHECK:   tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [%[[TS0]], %[[TS1]]] [1, 1] :
  //      CHECK:   linalg.matmul
  //      CHECK:   scf.forall.in_parallel
  // CHECK-NEXT:    tensor.parallel_insert_slice
  %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
                    outs(%C : tensor<?x?xf32>) -> (tensor<?x?xf32>)
  return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %sz = transform.param.constant 10 : i64 -> !transform.param<i64>
    %1:2 = transform.structured.tile_using_forall %0 tile_sizes [%sz, 20]
           : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}

// -----

// A tile-size parameter handle carrying more than one value must be rejected.
func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
  %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
                    outs(%C : tensor<?x?xf32>) -> (tensor<?x?xf32>)
  return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    // NOTE(review): the payload contains linalg.matmul, so matching
    // "linalg.matmul_transpose_b" yields an empty handle; the diagnostic below
    // appears to be produced by the tile-size parameter check regardless.
    // Confirm whether "linalg.matmul" was intended here.
    %0 = transform.structured.match ops{["linalg.matmul_transpose_b"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %c10 = transform.param.constant 10 : i64 -> !transform.param<i64>
    %c20 = transform.param.constant 20 : i64 -> !transform.param<i64>
    %sz = transform.merge_handles %c10, %c20 : !transform.param<i64>
    // expected-error @below {{requires exactly one parameter associated}}
    %1:2 = transform.structured.tile_using_forall %0 tile_sizes [%sz, 20]
           : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}

// -----

// Both tile sizes are packed into a single !transform.any_param (two values)
// and unpacked with the *(...) syntax.

// CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0] -> (s0 ceildiv 10)>
// CHECK-DAG: #[[$map1:.+]] = affine_map<()[s0] -> (s0 ceildiv 20)>
// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0] -> (d0 * -10 + s0, 10)>
// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)>
// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0) -> (d0 * 10)>
// CHECK-DAG: #[[$map5:.+]] = affine_map<(d0) -> (d0 * 20)>

// CHECK-LABEL: matmul_tile_size_dynamic(
//  CHECK-SAME:   %[[A:[0-9a-z]+]]: tensor<?x?xf32>
//  CHECK-SAME:   %[[B:[0-9a-z]+]]: tensor<?x?xf32>
//  CHECK-SAME:   %[[C:[0-9a-z]+]]: tensor<?x?xf32>
func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
  //      CHECK: %[[c1:.*]] = arith.constant 1 : index
  //      CHECK: %[[c0:.*]] = arith.constant 0 : index
  //  CHECK-DAG: %[[M:.+]] = tensor.dim %[[A]], %[[c0]] :
  //  CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %[[c1]] :
  //  CHECK-DAG: %[[NT0:.+]] = affine.apply #map()[%[[M]]]
  //  CHECK-DAG: %[[NT1:.+]] = affine.apply #map1()[%[[N]]]
  //  CHECK-DAG: %[[K:.+]] = tensor.dim %[[A]], %[[c1]] :
  //      CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
  //  CHECK-DAG:   %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]]
  //  CHECK-DAG:   %[[TS1:.+]] = affine.min #[[$map3]](%[[IV1]])[%[[N]]]
  //  CHECK-DAG:   %[[LB0:.+]] = affine.apply #[[$map4]](%[[IV0]])
  //  CHECK-DAG:   %[[LB1:.+]] = affine.apply #[[$map5]](%[[IV1]])
  //      CHECK:   tensor.extract_slice %[[A]][%[[LB0]], 0] [%[[TS0]], %[[K]]] [1, 1] :
  //      CHECK:   tensor.extract_slice %[[B]][0, %[[LB1]]] [%[[K]], %[[TS1]]] [1, 1] :
  //      CHECK:   tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [%[[TS0]], %[[TS1]]] [1, 1] :
  //      CHECK:   linalg.matmul
  //      CHECK:   scf.forall.in_parallel
  // CHECK-NEXT:    tensor.parallel_insert_slice
  %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
                    outs(%C : tensor<?x?xf32>) -> (tensor<?x?xf32>)
  return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %c10 = transform.param.constant 10 : i64 -> !transform.any_param
    %c20 = transform.param.constant 20 : i64 -> !transform.any_param
    %sz = transform.merge_handles %c10, %c20 : !transform.any_param
    %1:2 = transform.structured.tile_using_forall %0 tile_sizes *(%sz)
           : (!transform.any_op, !transform.any_param) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}

// -----

// A tile-size parameter that is not an integer attribute triggers an error.
func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
  %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
                    outs(%C : tensor<?x?xf32>) -> (tensor<?x?xf32>)
  return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %sz = transform.param.constant "[10 : i64, 20 : i64]" -> !transform.any_param
    // expected-error @below {{expected the parameter to be associated with an integer attribute}}
    %1:2 = transform.structured.tile_using_forall %0 tile_sizes *(%sz)
           : (!transform.any_op, !transform.any_param) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}

// -----

#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d0)>

// Tiling the reduction dimension (axis #1) with 2 threads is expected to emit
// a thread-safety warning.
func.func @tile_thread_safety1(%arg0: tensor<100x300xf32>, %arg1: tensor<100xf32>) -> tensor<100xf32> {
  // expected-warning@below {{tiling is not thread safe at axis #1}}
  %0 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%arg0 : tensor<100x300xf32>) outs(%arg1 : tensor<100xf32>) {
  ^bb0(%in: f32, %out: f32):
    %1 = arith.addf %in, %out : f32
    linalg.yield %1 : f32
  } -> tensor<100xf32>
  return %0 : tensor<100xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    // Result names follow the (tiled op, forall op) binding order used by the
    // other tests in this file — TODO confirm against the op's result order.
    %tiled_generic, %forall = transform.structured.tile_using_forall %0 num_threads [4, 2]
          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}

// -----
611
// Thread-safety warning test #2: the leading axis d0 is the reduction
// (100x300x8 accumulated into 300x8). num_threads [8] distributes exactly
// that first axis across 8 threads, so a warning on axis #0 is expected.
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>

func.func @tile_thread_safety2(%arg0: tensor<100x300x8xf32>, %arg1: tensor<300x8xf32>) -> tensor<300x8xf32> {
  // expected-warning@below {{tiling is not thread safe at axis #0}}
  %0 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["reduction", "parallel", "parallel"]} ins(%arg0 : tensor<100x300x8xf32>) outs(%arg1 : tensor<300x8xf32>) {
  ^bb0(%in: f32, %out: f32):
    %1 = arith.addf %in, %out : f32
    linalg.yield %1 : f32
  } -> tensor<300x8xf32>
  return %0 : tensor<300x8xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %forall, %tiled_generic = transform.structured.tile_using_forall %0 num_threads [8]
          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}
633
634// -----
635
// Thread-safety warning test #3: the reduction axis sits in the middle
// (d1 of 100x300x8, accumulated into 100x8). num_threads [8, 4, 2] assigns
// 4 threads to that axis, so a warning on axis #1 is expected.
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>

func.func @tile_thread_safety3(%arg0: tensor<100x300x8xf32>, %arg1: tensor<100x8xf32>) -> tensor<100x8xf32> {
  // expected-warning@below {{tiling is not thread safe at axis #1}}
  %0 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg0 : tensor<100x300x8xf32>) outs(%arg1 : tensor<100x8xf32>) {
  ^bb0(%in: f32, %out: f32):
    %1 = arith.addf %in, %out : f32
    linalg.yield %1 : f32
  } -> tensor<100x8xf32>
  return %0 : tensor<100x8xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %forall, %tiled_generic = transform.structured.tile_using_forall %0 num_threads [8, 4, 2]
          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}
657
658// -----
659
// Thread-safety warning test #4: two reduction axes (d0 and d1) and two
// outputs. num_threads [8, 4, 2] tiles both reduction axes (8 and 4 threads
// respectively), so one warning per unsafe axis is expected — axis #0 and
// axis #1. Note the two diagnostic designators (@+2 and @below) both anchor
// to the same linalg.generic line.
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d2)>

func.func @tile_thread_safety4(%arg0: tensor<100x300x8xf32>, %arg1: tensor<100x8xf32>, %arg2 : tensor<8xf32>) -> (tensor<100x8xf32>, tensor<8xf32>) {
  // expected-warning@+2 {{tiling is not thread safe at axis #0}}
  // expected-warning@below {{tiling is not thread safe at axis #1}}
  %0:2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "reduction", "parallel"]} ins(%arg0 : tensor<100x300x8xf32>) outs(%arg1, %arg2 : tensor<100x8xf32>, tensor<8xf32>) {
  ^bb0(%in: f32, %out1: f32, %out2: f32):
    %1 = arith.addf %in, %out1 : f32
    %2 = arith.addf %in, %out2 : f32
    linalg.yield %1, %2 : f32, f32
  } -> (tensor<100x8xf32>, tensor<8xf32>)
  return %0#0, %0#1 : tensor<100x8xf32>, tensor<8xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %forall, %tiled_generic = transform.structured.tile_using_forall %0 num_threads [8, 4, 2]
          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}
684
685// -----
686
// Thread-safety warning test #5: same payload as test #1, but the unsafe
// tiling is requested via tile_sizes [10, 1] instead of num_threads — a tile
// size of 1 on the size-300 reduction axis d1 still splits the reduction
// across threads, so the same axis #1 warning is expected.
#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d0)>

func.func @tile_thread_safety5(%arg0: tensor<100x300xf32>, %arg1: tensor<100xf32>) -> tensor<100xf32> {
  // expected-warning@below {{tiling is not thread safe at axis #1}}
  %0 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%arg0 : tensor<100x300xf32>) outs(%arg1 : tensor<100xf32>) {
  ^bb0(%in: f32, %out: f32):
    %1 = arith.addf %in, %out : f32
    linalg.yield %1 : f32
  } -> tensor<100xf32>
  return %0 : tensor<100xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %forall, %tiled_generic = transform.structured.tile_using_forall %0 tile_sizes [10, 1]
          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}
708
709// -----
710
// Thread-safety warning test #6: named op instead of linalg.generic. For
// matmul, axis #2 is the contraction (k) dimension; num_threads [2, 0, 8]
// requests 8 threads along it (the 0 entry leaves axis #1 alone), so a
// warning on axis #2 is expected.
func.func @tile_thread_safety6(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
  // expected-warning@below {{tiling is not thread safe at axis #2}}
  %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
                    outs(%C : tensor<?x?xf32>) -> (tensor<?x?xf32>)
  return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %forall, %tiled_generic = transform.structured.tile_using_forall %0 num_threads [2, 0, 8]
          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}
726