// RUN: mlir-opt %s -transform-interpreter --cse --canonicalize -split-input-file -verify-diagnostics | FileCheck %s
// RUN: mlir-opt %s -transform-interpreter -split-input-file -verify-diagnostics | FileCheck %s --check-prefix CHECK-NOCLEANUP

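// Positive test: two independent scf.for loops with identical bounds are fused.
// The first loop (the target) is fused into the second (the source); the fused
// loop carries both iter_args, in the order (%A, %B), and yields both results.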
// CHECK: func.func @fuse_1st_for_into_2nd([[A:%.*]]: {{.*}}, [[B:%.*]]: {{.*}}
func.func @fuse_1st_for_into_2nd(%A: tensor<128xf32>, %B: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) {
  // CHECK-DAG: [[C0:%.*]] = arith.constant 0 : index
  // CHECK-DAG: [[C16:%.*]] = arith.constant 16 : index
  // CHECK-DAG: [[C128:%.*]] = arith.constant 128 : index
  // CHECK-DAG: [[ZERO:%.*]] = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %c16 = arith.constant 16 : index
  %c128 = arith.constant 128 : index
  %cst = arith.constant 0.000000e+00 : f32
  // CHECK: [[R0:%.*]]:2 = scf.for [[IV:%.*]] = [[C0]] to [[C128]] step [[C16]] iter_args([[IA:%.*]] = [[A]], [[IB:%.*]] = [[B]]) {{.*}}
  %1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %A) -> (tensor<128xf32>) {
  // CHECK-DAG:   [[ASLICE:%.*]] = vector.transfer_read [[A]][[[IV]]], [[ZERO]]
  // CHECK-DAG:   [[SLICE0:%.*]] = vector.transfer_read [[IA]][[[IV]]], [[ZERO]]
  // CHECK:       [[OUT1:%.*]] = arith.addf [[SLICE0]], [[ASLICE]]
  // CHECK-NEXT:  [[WRT0:%.*]] = vector.transfer_write [[OUT1]], [[IA]][[[IV]]]
    %2 = vector.transfer_read %A[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %5 = arith.addf %3, %2 : vector<16xf32>
    %6 = vector.transfer_write %5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
    scf.yield %6 : tensor<128xf32>
  }
  %dup1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %B) -> (tensor<128xf32>) {
  // CHECK-DAG:   [[SLICE1:%.*]] = vector.transfer_read [[IB]][[[IV]]], [[ZERO]]
  // CHECK:       [[OUT2:%.*]] = arith.addf [[SLICE1]], [[ASLICE]]
  // CHECK-NEXT:  [[WRT1:%.*]] = vector.transfer_write [[OUT2]], [[IB]][[[IV]]]
    %dup2 = vector.transfer_read %A[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %dup3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %dup5 = arith.addf %dup3, %dup2 : vector<16xf32>
    %dup6 = vector.transfer_write %dup5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
  // CHECK: scf.yield [[WRT0]], [[WRT1]] : {{.*}}
    scf.yield %dup6 : tensor<128xf32>
  }
  return %1, %dup1 : tensor<128xf32>, tensor<128xf32>
}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["scf.for"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %for:2 = transform.split_handle %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused = transform.loop.fuse_sibling %for#0 into %for#1 : (!transform.any_op, !transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

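// Positive test: same sibling loops, but now the second scf.for is the target and
// the first is the source, so the fused iter_args are ordered (%B, %A).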
// CHECK: func.func @fuse_2nd_for_into_1st([[A:%.*]]: {{.*}}, [[B:%.*]]: {{.*}}
func.func @fuse_2nd_for_into_1st(%A: tensor<128xf32>, %B: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) {
  // CHECK-DAG: [[C0:%.*]] = arith.constant 0 : index
  // CHECK-DAG: [[C16:%.*]] = arith.constant 16 : index
  // CHECK-DAG: [[C128:%.*]] = arith.constant 128 : index
  // CHECK-DAG: [[ZERO:%.*]] = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %c16 = arith.constant 16 : index
  %c128 = arith.constant 128 : index
  %cst = arith.constant 0.000000e+00 : f32
  // CHECK: [[R0:%.*]]:2 = scf.for [[IV:%.*]] = [[C0]] to [[C128]] step [[C16]] iter_args([[IB:%.*]] = [[B]], [[IA:%.*]] = [[A]]) {{.*}}
  %1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %A) -> (tensor<128xf32>) {
  // CHECK-DAG:   [[ASLICE:%.*]] = vector.transfer_read [[A]][[[IV]]], [[ZERO]]
  // CHECK-DAG:   [[SLICE0:%.*]] = vector.transfer_read [[IB]][[[IV]]], [[ZERO]]
  // CHECK:       [[OUT1:%.*]] = arith.addf [[SLICE0]], [[ASLICE]]
  // CHECK-NEXT:  [[WRT0:%.*]] = vector.transfer_write [[OUT1]], [[IB]][[[IV]]]
    %2 = vector.transfer_read %A[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %5 = arith.addf %3, %2 : vector<16xf32>
    %6 = vector.transfer_write %5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
    scf.yield %6 : tensor<128xf32>
  }
  %dup1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %B) -> (tensor<128xf32>) {
  // CHECK-DAG:   [[SLICE1:%.*]] = vector.transfer_read [[IA]][[[IV]]], [[ZERO]]
  // CHECK:       [[OUT2:%.*]] = arith.addf [[SLICE1]], [[ASLICE]]
  // CHECK-NEXT:  [[WRT1:%.*]] = vector.transfer_write [[OUT2]], [[IA]][[[IV]]]
    %dup2 = vector.transfer_read %A[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
  // NB: The dominance check used to fail on the following line; however, the
  // defining op for the value of %arg3 occurs above the source loop and is
  // hence safe, and %arg4 is a block argument of the scope of the loops and is
  // hence safe as well.
    %dup3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %dup5 = arith.addf %dup3, %dup2 : vector<16xf32>
    %dup6 = vector.transfer_write %dup5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
  // CHECK: scf.yield [[WRT0]], [[WRT1]] : {{.*}}
    scf.yield %dup6 : tensor<128xf32>
  }
  return %1, %dup1 : tensor<128xf32>, tensor<128xf32>
}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["scf.for"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %for:2 = transform.split_handle %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused = transform.loop.fuse_sibling %for#1 into %for#0 : (!transform.any_op, !transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

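// Positive test: two linalg.matmuls sharing the same zero-filled output are tiled
// with scf.forall, and the resulting sibling foralls are fused into a single
// forall computing both results.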
// CHECK: func.func @matmul_fuse_1st_forall_into_2nd([[A1:%.*]]: {{.*}}, [[A2:%.*]]: {{.*}}, [[B:%.*]]: {{.*}}
func.func @matmul_fuse_1st_forall_into_2nd(%A1 : tensor<128x128xf32>, %A2 : tensor<128x128xf32>, %B : tensor<128x128xf32>) -> (tensor<128x128xf32>, tensor<128x128xf32>) {
  %zero = arith.constant 0.0 : f32
  %out_alloc = tensor.empty() : tensor<128x128xf32>
  %out = linalg.fill ins(%zero : f32) outs(%out_alloc : tensor<128x128xf32>) -> tensor<128x128xf32>

  // CHECK: scf.forall ([[I:%.*]]) in (4) shared_outs([[S1:%.*]] = [[IN1:%.*]], [[S2:%.*]] = [[IN2:%.*]]) -> (tensor<128x128xf32>, tensor<128x128xf32>) {
  // CHECK:   [[T:%.*]] = affine.apply
  // CHECK:   tensor.extract_slice [[A2]][[[T]], 0] [32, 128] [1, 1]
  // CHECK:   tensor.extract_slice [[S1]][[[T]], 0] [32, 128] [1, 1]
  // CHECK:   [[OUT1:%.*]] = linalg.matmul
  // CHECK:   tensor.extract_slice [[A1]][[[T]], 0] [32, 128] [1, 1]
  // CHECK:   tensor.extract_slice [[S2]][[[T]], 0] [32, 128] [1, 1]
  // CHECK:   [[OUT2:%.*]] = linalg.matmul
  // CHECK:   scf.forall.in_parallel {
  // CHECK:     tensor.parallel_insert_slice [[OUT1]] into [[S1]][[[T]], 0] [32, 128] [1, 1]
  // CHECK:     tensor.parallel_insert_slice [[OUT2]] into [[S2]][[[T]], 0] [32, 128] [1, 1]
  // CHECK:   }
  // CHECK: }
  %out1 = linalg.matmul ins(%A1, %B : tensor<128x128xf32>, tensor<128x128xf32>) outs(%out : tensor<128x128xf32>) -> tensor<128x128xf32>
  %out2 = linalg.matmul ins(%A2, %B : tensor<128x128xf32>, tensor<128x128xf32>) outs(%out : tensor<128x128xf32>) -> tensor<128x128xf32>

  func.return %out1, %out2 : tensor<128x128xf32>, tensor<128x128xf32>
}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%variant_op : !transform.any_op {transform.readonly}) {
    %matched = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> (!transform.any_op)

    %mm1, %mm2 = transform.split_handle %matched : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    %tiled_mm1, %loop1 = transform.structured.tile_using_forall %mm1 tile_sizes [32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %tiled_mm2, %loop2 = transform.structured.tile_using_forall %mm2 tile_sizes [32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    %fused_loop = transform.loop.fuse_sibling %loop2 into %loop1 : (!transform.any_op, !transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

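// Positive test: same setup as above, with the roles of target and source forall
// swapped in the fuse_sibling call.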
// CHECK: func.func @matmul_fuse_2nd_forall_into_1st([[A1:%.*]]: {{.*}}, [[A2:%.*]]: {{.*}}, [[B:%.*]]: {{.*}}
func.func @matmul_fuse_2nd_forall_into_1st(%A1 : tensor<128x128xf32>, %A2 : tensor<128x128xf32>, %B : tensor<128x128xf32>) -> (tensor<128x128xf32>, tensor<128x128xf32>) {
  %zero = arith.constant 0.0 : f32
  %out_alloc = tensor.empty() : tensor<128x128xf32>
  %out = linalg.fill ins(%zero : f32) outs(%out_alloc : tensor<128x128xf32>) -> tensor<128x128xf32>

  // CHECK: scf.forall ([[I:%.*]]) in (4) shared_outs([[S1:%.*]] = [[IN1:%.*]], [[S2:%.*]] = [[IN2:%.*]]) -> (tensor<128x128xf32>, tensor<128x128xf32>) {
  // CHECK:   [[T:%.*]] = affine.apply
  // CHECK:   tensor.extract_slice [[A1]][[[T]], 0] [32, 128] [1, 1]
  // CHECK:   tensor.extract_slice [[S1]][[[T]], 0] [32, 128] [1, 1]
  // CHECK:   [[OUT1:%.*]] = linalg.matmul
  // CHECK:   tensor.extract_slice [[A2]][[[T]], 0] [32, 128] [1, 1]
  // CHECK:   tensor.extract_slice [[S2]][[[T]], 0] [32, 128] [1, 1]
  // CHECK:   [[OUT2:%.*]] = linalg.matmul
  // CHECK:   scf.forall.in_parallel {
  // CHECK:     tensor.parallel_insert_slice [[OUT1]] into [[S1]][[[T]], 0] [32, 128] [1, 1]
  // CHECK:     tensor.parallel_insert_slice [[OUT2]] into [[S2]][[[T]], 0] [32, 128] [1, 1]
  // CHECK:   }
  // CHECK: }
  %out1 = linalg.matmul ins(%A1, %B : tensor<128x128xf32>, tensor<128x128xf32>) outs(%out : tensor<128x128xf32>) -> tensor<128x128xf32>
  %out2 = linalg.matmul ins(%A2, %B : tensor<128x128xf32>, tensor<128x128xf32>) outs(%out : tensor<128x128xf32>) -> tensor<128x128xf32>

  func.return %out1, %out2 : tensor<128x128xf32>, tensor<128x128xf32>
}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%variant_op : !transform.any_op {transform.readonly}) {
    %matched = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> (!transform.any_op)

    %mm1, %mm2 = transform.split_handle %matched : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    %tiled_mm1, %loop1 = transform.structured.tile_using_forall %mm1 tile_sizes [32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %tiled_mm2, %loop2 = transform.structured.tile_using_forall %mm2 tile_sizes [32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    %fused_loop = transform.loop.fuse_sibling %loop1 into %loop2 : (!transform.any_op, !transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

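// Positive test: fusion of two scf.for loops that carry no iter_args. The loop
// bodies only produce unused values, so the CSE/canonicalize pipeline would fold
// them away; the result is therefore checked with the CHECK-NOCLEANUP prefix.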
// CHECK-NOCLEANUP: func.func @fuse_no_iter_args([[A:%.*]]: {{.*}}, [[B:%.*]]: {{.*}}
func.func @fuse_no_iter_args(%A: tensor<128xf32>, %B: tensor<128xf32>) {
  // CHECK-NOCLEANUP: [[C0:%.*]] = arith.constant 0 : index
  // CHECK-NOCLEANUP: [[C16:%.*]] = arith.constant 16 : index
  // CHECK-NOCLEANUP: [[C128:%.*]] = arith.constant 128 : index
  // CHECK-NOCLEANUP: [[ZERO:%.*]] = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %c16 = arith.constant 16 : index
  %c128 = arith.constant 128 : index
  %cst = arith.constant 0.000000e+00 : f32
  // CHECK-NOCLEANUP: scf.for [[IV:%.*]] = [[C0]] to [[C128]] step [[C16]] {{.*}}
  scf.for %arg0 = %c0 to %c128 step %c16 {
  // CHECK-NOCLEANUP:   [[ASLICE:%.*]] = vector.transfer_read [[A]][[[IV]]], [[ZERO]]
    %2 = vector.transfer_read %A[%arg0], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    scf.yield
  }
  scf.for %arg0 = %c0 to %c128 step %c16 {
  // CHECK-NOCLEANUP:   [[BSLICE:%.*]] = vector.transfer_read [[B]][[[IV]]], [[ZERO]]
    %dup2 = vector.transfer_read %B[%arg0], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    scf.yield
  }
  return
}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["scf.for"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %for:2 = transform.split_handle %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused = transform.loop.fuse_sibling %for#0 into %for#1 : (!transform.any_op, !transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

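// Negative test: the second scf.for consumes the result of the first as its
// iter_arg, so the loops are not independent and fusion must be rejected.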
func.func @source_for_uses_result_of_target_for_err(%A: tensor<128xf32>, %B: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) {
  %c0 = arith.constant 0 : index
  %c16 = arith.constant 16 : index
  %c128 = arith.constant 128 : index
  %cst = arith.constant 0.000000e+00 : f32
  // expected-error @below {{user of results of target should be properly dominated by source}}
  %1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %A) -> (tensor<128xf32>) {
    %2 = vector.transfer_read %A[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %5 = arith.addf %3, %2 : vector<16xf32>
    %6 = vector.transfer_write %5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
    scf.yield %6 : tensor<128xf32>
  }
  %dup1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %1) -> (tensor<128xf32>) {
    %dup2 = vector.transfer_read %A[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %dup3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %dup5 = arith.addf %dup3, %dup2 : vector<16xf32>
    %dup6 = vector.transfer_write %dup5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
    scf.yield %dup6 : tensor<128xf32>
  }
  return %1, %dup1 : tensor<128xf32>, tensor<128xf32>
}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["scf.for"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %for:2 = transform.split_handle %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused = transform.loop.fuse_sibling %for#0 into %for#1 : (!transform.any_op, !transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

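// Negative test: the second matmul consumes the result of the first, so after
// tiling the source forall uses the target's result and fusion must be rejected.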
func.func @source_forall_uses_result_of_target_forall_err(%A : tensor<128x128xf32>, %B1 : tensor<128x128xf32>, %B2 : tensor<128x128xf32>) -> (tensor<128x128xf32>, tensor<128x128xf32>) {
  %zero = arith.constant 0.0 : f32
  %out_alloc = tensor.empty() : tensor<128x128xf32>
  %out = linalg.fill ins(%zero : f32) outs(%out_alloc : tensor<128x128xf32>) -> tensor<128x128xf32>

  // expected-error @below {{user of results of target should be properly dominated by source}}
  %out1 = linalg.matmul ins(%A, %B1 : tensor<128x128xf32>, tensor<128x128xf32>) outs(%out : tensor<128x128xf32>) -> tensor<128x128xf32>
  %out2 = linalg.matmul ins(%A, %out1 : tensor<128x128xf32>, tensor<128x128xf32>) outs(%out : tensor<128x128xf32>) -> tensor<128x128xf32>

  func.return %out1, %out2 : tensor<128x128xf32>, tensor<128x128xf32>
}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%variant_op : !transform.any_op {transform.readonly}) {
    %matched = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> (!transform.any_op)

    %mm1, %mm2 = transform.split_handle %matched : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    %tiled_mm1, %loop1 = transform.structured.tile_using_forall %mm1 tile_sizes [32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %tiled_mm2, %loop2 = transform.structured.tile_using_forall %mm2 tile_sizes [32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    %fused_loop = transform.loop.fuse_sibling %loop1 into %loop2 : (!transform.any_op, !transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

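// Negative test: the body of the target loop (the second scf.for) reads the
// result of the source loop (the first), so fusion must be rejected.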
func.func @target_for_region_uses_result_of_source_for_err(%A: tensor<128xf32>, %B: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) {
  %c0 = arith.constant 0 : index
  %c16 = arith.constant 16 : index
  %c128 = arith.constant 128 : index
  %cst = arith.constant 0.000000e+00 : f32
  %1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %A) -> (tensor<128xf32>) {
    %2 = vector.transfer_read %A[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %5 = arith.addf %3, %2 : vector<16xf32>
    %6 = vector.transfer_write %5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
    scf.yield %6 : tensor<128xf32>
  }
  %dup1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %B) -> (tensor<128xf32>) {
  // expected-error @below {{values used inside regions of target should be properly dominated by source}}
    %dup2 = vector.transfer_read %1[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %dup3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %dup5 = arith.addf %dup3, %dup2 : vector<16xf32>
    %dup6 = vector.transfer_write %dup5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
    scf.yield %dup6 : tensor<128xf32>
  }
  return %1, %dup1 : tensor<128xf32>, tensor<128xf32>
}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["scf.for"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %for:2 = transform.split_handle %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused = transform.loop.fuse_sibling %for#1 into %for#0 : (!transform.any_op, !transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

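// Negative test: the target forall's output operand is produced by a linalg.fill
// that does not dominate the source forall, so fusion must be rejected.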
func.func @target_forall_depends_on_value_not_dominated_by_source_forall_err(%A1 : tensor<128x128xf32>, %A2 : tensor<128x128xf32>, %B : tensor<128x128xf32>) -> (tensor<128x128xf32>, tensor<128x128xf32>) {
  %zero = arith.constant 0.0 : f32
  %buf1_alloc = tensor.empty() : tensor<128x128xf32>
  %buf1 = linalg.fill ins(%zero : f32) outs(%buf1_alloc : tensor<128x128xf32>) -> tensor<128x128xf32>
  %out1 = linalg.matmul ins(%A1, %B : tensor<128x128xf32>, tensor<128x128xf32>) outs(%buf1 : tensor<128x128xf32>) -> tensor<128x128xf32>
  %out_alloc2 = tensor.empty() : tensor<128x128xf32>
  %buf2 = linalg.fill ins(%zero : f32) outs(%buf1_alloc : tensor<128x128xf32>) -> tensor<128x128xf32>
  // expected-error @below {{operands of target should be properly dominated by source}}
  %out2 = linalg.matmul ins(%A2, %B : tensor<128x128xf32>, tensor<128x128xf32>) outs(%buf2 : tensor<128x128xf32>) -> tensor<128x128xf32>

  func.return %out1, %out2 : tensor<128x128xf32>, tensor<128x128xf32>
}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%variant_op : !transform.any_op {transform.readonly}) {
    %matched = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> (!transform.any_op)

    %mm1, %mm2 = transform.split_handle %matched : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    %tiled_mm1, %loop1 = transform.structured.tile_using_forall %mm1 tile_sizes [32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %tiled_mm2, %loop2 = transform.structured.tile_using_forall %mm2 tile_sizes [32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    %fused_loop = transform.loop.fuse_sibling %loop2 into %loop1 : (!transform.any_op, !transform.any_op) -> !transform.any_op
    transform.yield
  }
}
// -----

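// Positive test: transform.foreach fuses corresponding (target, source) loop
// pairs, matched via the target_loops and source_loops attributes, yielding one
// fused loop per pair.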
// CHECK: func.func @foreach_loop_pair_fuse([[A:%.*]]: {{.*}}, [[B:%.*]]: {{.*}}
func.func @foreach_loop_pair_fuse(%arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>) {
  // CHECK-DAG: [[C0:%.*]] = arith.constant 0 : index
  // CHECK-DAG: [[C16:%.*]] = arith.constant 16 : index
  // CHECK-DAG: [[C128:%.*]] = arith.constant 128 : index
  // CHECK-DAG: [[ZERO:%.*]] = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %c16 = arith.constant 16 : index
  %c32 = arith.constant 32 : index
  %c128 = arith.constant 128 : index
  %cst = arith.constant 0.000000e+00 : f32
  // CHECK: [[RST:%.*]]:2 = scf.for [[IV:%.*]] = [[C0]] to [[C128]] step [[C16]] iter_args([[IB0:%.*]] = [[B]], [[IB1:%.*]] = [[B]]) {{.*}}
  %1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %arg2) -> (tensor<128xf32>) {
  // CHECK-DAG:   [[ASLICE:%.*]] = vector.transfer_read [[A]][[[IV]]], [[ZERO]]
  // CHECK-DAG:   [[SLICE0:%.*]] = vector.transfer_read [[IB0]][[[IV]]], [[ZERO]]
  // CHECK:       [[OUT1:%.*]] = arith.addf [[SLICE0]], [[ASLICE]]
  // CHECK-NEXT:  [[WRT0:%.*]] = vector.transfer_write [[OUT1]], [[IB0]][[[IV]]]
    %2 = vector.transfer_read %arg1[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %5 = arith.addf %3, %2 : vector<16xf32>
    %6 = vector.transfer_write %5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
    scf.yield %6 : tensor<128xf32>
  } {target_loops}
  %dup1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %arg2) -> (tensor<128xf32>) {
  // CHECK-DAG:   [[SLICE1:%.*]] = vector.transfer_read [[IB1]][[[IV]]], [[ZERO]]
  // CHECK:       [[OUT2:%.*]] = arith.addf [[SLICE1]], [[ASLICE]]
  // CHECK-NEXT:  [[WRT1:%.*]] = vector.transfer_write [[OUT2]], [[IB1]][[[IV]]]
    %dup2 = vector.transfer_read %arg1[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %dup3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %dup5 = arith.addf %dup3, %dup2 : vector<16xf32>
    %dup6 = vector.transfer_write %dup5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
  // CHECK: scf.yield [[WRT0]], [[WRT1]] : {{.*}}
    scf.yield %dup6 : tensor<128xf32>
  } {source_loops}
  %2 = scf.for %arg3 = %c0 to %c128 step %c32 iter_args(%arg4 = %arg2) -> (tensor<128xf32>) {
  // CHECK-DAG:   [[ASLICE:%.*]] = vector.transfer_read [[A]][[[IV]]], [[ZERO]]
  // CHECK-DAG:   [[SLICE0:%.*]] = vector.transfer_read [[IB0]][[[IV]]], [[ZERO]]
  // CHECK:       [[OUT1:%.*]] = arith.addf [[SLICE0]], [[ASLICE]]
  // CHECK-NEXT:  [[WRT0:%.*]] = vector.transfer_write [[OUT1]], [[IB0]][[[IV]]]
    %2 = vector.transfer_read %arg1[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<32xf32>
    %3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<32xf32>
    %5 = arith.addf %3, %2 : vector<32xf32>
    %6 = vector.transfer_write %5, %arg4[%arg3] {in_bounds = [true]} : vector<32xf32>, tensor<128xf32>
    scf.yield %6 : tensor<128xf32>
  } {target_loops}
  %dup2 = scf.for %arg3 = %c0 to %c128 step %c32 iter_args(%arg4 = %arg2) -> (tensor<128xf32>) {
  // CHECK-DAG:   [[SLICE1:%.*]] = vector.transfer_read [[IB1]][[[IV]]], [[ZERO]]
  // CHECK:       [[OUT2:%.*]] = arith.addf [[SLICE1]], [[ASLICE]]
  // CHECK-NEXT:  [[WRT1:%.*]] = vector.transfer_write [[OUT2]], [[IB1]][[[IV]]]
    %dup2 = vector.transfer_read %arg1[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<32xf32>
    %dup3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<32xf32>
    %dup5 = arith.addf %dup3, %dup2 : vector<32xf32>
    %dup6 = vector.transfer_write %dup5, %arg4[%arg3] {in_bounds = [true]} : vector<32xf32>, tensor<128xf32>
  // CHECK: scf.yield [[WRT0]], [[WRT1]] : {{.*}}
    scf.yield %dup6 : tensor<128xf32>
  } {source_loops}
  return %1, %dup1, %2, %dup2 : tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %target_loops = transform.structured.match ops{["scf.for"]} attributes {target_loops} in %arg0 : (!transform.any_op) -> !transform.any_op
    %source_loops = transform.structured.match ops{["scf.for"]} attributes {source_loops} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.foreach %target_loops, %source_loops : !transform.any_op, !transform.any_op {
    ^bb0(%target_loop: !transform.any_op, %source_loop: !transform.any_op):
      %fused = transform.loop.fuse_sibling %target_loop into %source_loop : (!transform.any_op, !transform.any_op) -> !transform.any_op
    }
    transform.yield
  }
}