// xref: /llvm-project/mlir/test/Dialect/Affine/loop-fusion.mlir (revision eaa32d20a2612370371047140734e91f8f22dea1)
// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion))' -split-input-file | FileCheck %s

// Part II of fusion tests in mlir/test/Transforms/loop-fusion-2.mlir.
// Part III of fusion tests in mlir/test/Transforms/loop-fusion-3.mlir
// Part IV of fusion tests in mlir/test/Transforms/loop-fusion-4.mlir

// TODO: Add more tests:
// *) Add nested fusion test cases when non-constant loop bound support is
//    added to iteration domain in dependence check.
// *) Add a test w/ floordiv/ceildiv/mod when supported in dependence check.
// *) Add tests which check fused computation slice indexing and loop bounds.
// TODO: Test clean up: move memref allocs to func args.

// -----

// Simple producer-consumer RAW dependence: the store loop should fuse into
// the load loop, shrinking %m to a single-element private memref.
// CHECK-LABEL: func @should_fuse_raw_dep_for_locality() {
func.func @should_fuse_raw_dep_for_locality() {
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }
  // CHECK:      affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_reduction_to_pointwise() {
func.func @should_fuse_reduction_to_pointwise() {
  %a = memref.alloc() : memref<10x10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 10 {
      %v0 = affine.load %b[%i0] : memref<10xf32>
      %v1 = affine.load %a[%i0, %i1] : memref<10x10xf32>
      %v3 = arith.addf %v0, %v1 : f32
      affine.store %v3, %b[%i0] : memref<10xf32>
    }
  }
  affine.for %i2 = 0 to 10 {
    %v4 = affine.load %b[%i2] : memref<10xf32>
    affine.store %v4, %c[%i2] : memref<10xf32>
  }

  // Should fuse in entire inner loop on %i1 from source loop nest, as %i1
  // is not used in the access function of the store/load on %b.
  // CHECK:       affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:    affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:      affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:      affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
  // CHECK-NEXT:      arith.addf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:    }
  // CHECK-NEXT:    affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return
  return
}

// -----

// CHECK-DAG: [[$MAP_SHIFT_MINUS_ONE_R1:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 - 1)>
// CHECK-DAG: [[$MAP_SHIFT_D0_BY_ONE:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (d0 + 1)>
// CHECK-DAG: [[$MAP_SHIFT_D1_BY_ONE:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (d1 + 1)>

// CHECK-LABEL: func @should_fuse_loop_nests_with_shifts() {
func.func @should_fuse_loop_nests_with_shifts() {
  %a = memref.alloc() : memref<10x10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 9 {
    affine.for %i1 = 0 to 9 {
      affine.store %cf7, %a[%i0 + 1, %i1 + 1] : memref<10x10xf32>
    }
  }
  affine.for %i2 = 1 to 10 {
    affine.for %i3 = 1 to 10 {
      %v0 = affine.load %a[%i2, %i3] : memref<10x10xf32>
    }
  }

  // Source slice affine apply sequence:
  // *) First two affine apply's map from the dst to src iteration space.
  // *) Third affine apply is access function around src store.
  // *) Fourth affine apply shifts the stores access function by '-1', because
  //    of the offset induced by reducing the memref shape from 10x10 to 9x9.
  // *) Fifth affine apply shifts the loads access function by '-1', because
  //    of the offset induced by reducing the memref shape from 10x10 to 9x9.
  // NOTE: Should create a private memref with reduced shape 9x9xf32.
  // CHECK:      affine.for %{{.*}} = 1 to 10 {
  // CHECK-NEXT:   affine.for %{{.*}} = 1 to 10 {
  // CHECK-NEXT:     %[[I:.*]] = affine.apply [[$MAP_SHIFT_MINUS_ONE_R1]](%{{.*}})
  // CHECK-NEXT:     %[[J:.*]] = affine.apply [[$MAP_SHIFT_MINUS_ONE_R1]](%{{.*}})
  // CHECK-NEXT:     affine.apply [[$MAP_SHIFT_D0_BY_ONE]](%[[I]], %[[J]])
  // CHECK-NEXT:     affine.apply [[$MAP_SHIFT_D1_BY_ONE]](%[[I]], %[[J]])
  // CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
  // CHECK-NEXT:     affine.load %{{.*}}[0, 0] : memref<1x1xf32>
  // CHECK-NEXT:   }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_loop_nest() {
func.func @should_fuse_loop_nest() {
  %a = memref.alloc() : memref<10x10xf32>
  %b = memref.alloc() : memref<10x10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 10 {
      affine.store %cf7, %a[%i0, %i1] : memref<10x10xf32>
    }
  }
  affine.for %i2 = 0 to 10 {
    affine.for %i3 = 0 to 10 {
      %v0 = affine.load %a[%i3, %i2] : memref<10x10xf32>
      affine.store %v0, %b[%i2, %i3] : memref<10x10xf32>
    }
  }
  affine.for %i4 = 0 to 10 {
    affine.for %i5 = 0 to 10 {
      %v1 = affine.load %b[%i4, %i5] : memref<10x10xf32>
    }
  }
  // Expecting private memref for '%a' first, then private memref for '%b'.
  // CHECK-DAG:  [[NEWA:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1x1xf32>
  // CHECK-DAG:  [[NEWB:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1x1xf32>
  // CHECK:      affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:     affine.store %{{.*}}, [[NEWA]][0, 0] : memref<1x1xf32>
  // CHECK-NEXT:     affine.load [[NEWA]][0, 0] : memref<1x1xf32>
  // CHECK-NEXT:     affine.store %{{.*}}, [[NEWB]][0, 0] : memref<1x1xf32>
  // CHECK-NEXT:     affine.load [[NEWB]][0, 0] : memref<1x1xf32>
  // CHECK-NEXT:   }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_across_intermediate_loop_with_no_deps() {
func.func @should_fuse_across_intermediate_loop_with_no_deps() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %a[%i0] : memref<10xf32>
    affine.store %v0, %b[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %c[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v1 = affine.load %b[%i2] : memref<10xf32>
  }

  // Should fuse first loop (past second loop with no dependences) into third.
  // Note that fusion creates a private memref '%2' for the fused loop nest.
  // CHECK:      affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK:      affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_all_loops() {
func.func @should_fuse_all_loops() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  // Set up flow dependences from first and second loops to third.
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %b[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v0 = affine.load %a[%i2] : memref<10xf32>
    %v1 = affine.load %b[%i2] : memref<10xf32>
  }

  // Should fuse first and second loops into third.
  // Expecting private memref for '%a' first, then private memref for '%b'.
  // CHECK-DAG: [[NEWA:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1xf32>
  // CHECK-DAG: [[NEWB:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1xf32>
  // CHECK:      affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.store %{{.*}}, [[NEWA]][0] : memref<1xf32>
  // CHECK-NEXT:   affine.store %{{.*}}, [[NEWB]][0] : memref<1xf32>
  // CHECK-NEXT:   affine.load [[NEWA]][0] : memref<1xf32>
  // CHECK-NEXT:   affine.load [[NEWB]][0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_first_and_second_loops() {
func.func @should_fuse_first_and_second_loops() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %a[%i1] : memref<10xf32>
    affine.store %cf7, %b[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v1 = affine.load %c[%i2] : memref<10xf32>
  }

  // Should fuse first loop into the second (last loop should not be fused).
  // Should create private memref '%2' for fused scf.
  // CHECK:      affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK:      affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return

  return
}

// -----

// CHECK-LABEL: func @should_not_fuse_would_create_cycle() {
func.func @should_not_fuse_would_create_cycle() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  // Set up the following dependences:
  // 1) loop0 -> loop1 on memref '%{{.*}}'
  // 2) loop0 -> loop2 on memref '%{{.*}}'
  // 3) loop1 -> loop2 on memref '%{{.*}}'
  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %a[%i0] : memref<10xf32>
    affine.store %cf7, %b[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %a[%i1] : memref<10xf32>
    %v1 = affine.load %c[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v2 = affine.load %b[%i2] : memref<10xf32>
    affine.store %cf7, %c[%i2] : memref<10xf32>
  }
  // Should not fuse: fusing loop first loop into last would create a cycle.
  // CHECK:      affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK:      affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:   affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK:      affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_producer_consumer() {
func.func @should_fuse_producer_consumer() {
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %m[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v1 = affine.load %m[%i2] : memref<10xf32>
  }
  // Fusing loop %i0 to %i2 would violate the WAW dependence between %i0 and
  // %i1, but OK to fuse %i1 into %i2.
  // TODO: When the fusion pass is run to a fixed-point, it should
  // fuse all three of these loop nests.
  // CHECK:      memref.alloc() : memref<1xf32>
  // CHECK:      affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_and_move_to_preserve_war_dep() {
func.func @should_fuse_and_move_to_preserve_war_dep() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %a[%i0] : memref<10xf32>
    affine.store %v0, %b[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %a[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v1 = affine.load %b[%i2] : memref<10xf32>
  }
  // Loops '%i1' and '%i2' have no dependences. We can fuse a slice of '%i0'
  // into '%i2' if we move the fused loop nest before '%i1', which preserves
  // the WAR dependence from load '%a' in '%i0' to the store '%a' in loop '%i1'.
  // CHECK:       affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:    affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_if_top_level_access() {
func.func @should_fuse_if_top_level_access() {
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }

  %c0 = arith.constant 4 : index
  %v1 = affine.load %m[%c0] : memref<10xf32>
  // Top-level load to '%m' should prevent creating a private memref but
  // loop nests should be fused and '%i0' should be removed.
  // CHECK:      %[[m:.*]] = memref.alloc() : memref<10xf32>
  // CHECK-NOT:  memref.alloc

  // CHECK:      affine.for %[[i1:.*]] = 0 to 10 {
  // CHECK-NEXT:   affine.store %{{.*}}, %[[m]][%[[i1]]] : memref<10xf32>
  // CHECK-NEXT:   affine.load %[[m]][%[[i1]]] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK:      affine.load %[[m]][%{{.*}}] : memref<10xf32>
  return
}

// -----

// CHECK-LABEL: func @should_fuse_but_not_remove_src() {
func.func @should_fuse_but_not_remove_src() {
  %m = memref.alloc() : memref<100xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 100 {
    affine.store %cf7, %m[%i0] : memref<100xf32>
  }
  affine.for %i1 = 0 to 17 {
    %v0 = affine.load %m[%i1] : memref<100xf32>
  }
  %v1 = affine.load %m[99] : memref<100xf32>

  // Loop '%i0' and '%i1' should be fused but '%i0' shouldn't be removed to
  // preserve the dependence with the top-level access.
  // CHECK:      affine.for %{{.*}} = 0 to 100 {
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<100xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 17 {
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.load %{{.*}}[99] : memref<100xf32>
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_no_top_level_access() {
func.func @should_fuse_no_top_level_access() {
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }
  // CHECK:      affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

#set0 = affine_set<(d0) : (1 == 0)>

// CHECK-LABEL: func @should_not_fuse_if_op_at_top_level() {
func.func @should_not_fuse_if_op_at_top_level() {
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }
  %c0 = arith.constant 4 : index
  affine.if #set0(%c0) {
  }
  // Top-level IfOp should prevent fusion.
  // CHECK:      affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK:      affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  return
}

// -----

#set0 = affine_set<(d0) : (1 == 0)>

// CHECK-LABEL: func @should_not_fuse_if_op_in_loop_nest() {
func.func @should_not_fuse_if_op_in_loop_nest() {
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32
  %c4 = arith.constant 4 : index

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.if #set0(%c4) {
    }
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }

  // IfOp in ForOp should prevent fusion.
  // CHECK:      affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK:      affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.if #set(%{{.*}}) {
  // CHECK-NEXT:   }
  // CHECK-NEXT:   affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  return
}

// -----

#set = affine_set<(d0) : (d0 - 1 >= 0)>

// CHECK-LABEL: func @should_fuse_if_op_in_loop_nest_not_sandwiched() -> memref<10xf32> {
func.func @should_fuse_if_op_in_loop_nest_not_sandwiched() -> memref<10xf32> {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %a[%i1] : memref<10xf32>
    affine.store %v0, %b[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    affine.if #set(%i2) {
      %v0 = affine.load %b[%i2] : memref<10xf32>
    }
  }

  // IfOp in ForOp should not prevent fusion if it does not in between the
  // source and dest ForOp ops.

  // CHECK:      affine.for
  // CHECK-NEXT:   affine.store
  // CHECK-NEXT:   affine.load
  // CHECK-NEXT:   affine.store
  // CHECK:      affine.for
  // CHECK-NEXT:   affine.if
  // CHECK-NEXT:     affine.load
  // CHECK-NOT:  affine.for
  // CHECK:      return

  return %a : memref<10xf32>
}

// -----

#set = affine_set<(d0) : (d0 - 1 >= 0)>

// CHECK-LABEL: func @should_not_fuse_if_op_in_loop_nest_between_src_and_dest() -> memref<10xf32> {
func.func @should_not_fuse_if_op_in_loop_nest_between_src_and_dest() -> memref<10xf32> {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.if #set(%i1) {
      affine.store %cf7, %a[%i1] : memref<10xf32>
    }
  }
  affine.for %i3 = 0 to 10 {
    %v0 = affine.load %a[%i3] : memref<10xf32>
    affine.store %v0, %b[%i3] : memref<10xf32>
  }
  return %b : memref<10xf32>

  // IfOp in ForOp which modifies the memref should prevent fusion if it is in
  // between the source and dest ForOp.

  // CHECK:      affine.for
  // CHECK-NEXT:   affine.store
  // CHECK:      affine.for
  // CHECK-NEXT:   affine.if
  // CHECK-NEXT:     affine.store
  // CHECK:      affine.for
  // CHECK-NEXT:   affine.load
  // CHECK-NEXT:   affine.store
  // CHECK:      return
}

// -----

// CHECK-LABEL: func @permute_and_fuse() {
func.func @permute_and_fuse() {
  %m = memref.alloc() : memref<10x20x30xf32>

  %cf7 = arith.constant 7.0 : f32
  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 20 {
      affine.for %i2 = 0 to 30 {
        affine.store %cf7, %m[%i0, %i1, %i2] : memref<10x20x30xf32>
      }
    }
  }
  affine.for %i3 = 0 to 30 {
    affine.for %i4 = 0 to 10 {
      affine.for %i5 = 0 to 20 {
        %v0 = affine.load %m[%i4, %i5, %i3] : memref<10x20x30xf32>
        "foo"(%v0) : (f32) -> ()
      }
    }
  }
// CHECK:       affine.for %{{.*}} = 0 to 30 {
// CHECK-NEXT:    affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:      affine.for %{{.*}} = 0 to 20 {
// CHECK-NEXT:        affine.store %{{.*}}, %{{.*}}[0, 0, 0] : memref<1x1x1xf32>
// CHECK-NEXT:        affine.load %{{.*}}[0, 0, 0] : memref<1x1x1xf32>
// CHECK-NEXT:        "foo"(%{{.*}}) : (f32) -> ()
// CHECK-NEXT:      }
// CHECK-NEXT:    }
// CHECK-NEXT:  }
// CHECK-NEXT:  return

  return
}

// -----

// CHECK-DAG: [[$MAP0:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (d0 * 4 + d1)>
// CHECK-DAG: [[$MAP1:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 floordiv 4)>
// CHECK-DAG: [[$MAP2:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 mod 4)>

// Reshape from a 64 x f32 to 16 x 4 x f32.
// CHECK-LABEL: func @fuse_reshape_64_16_4
func.func @fuse_reshape_64_16_4(%in : memref<64xf32>) {
  %out = memref.alloc() : memref<16x4xf32>

  affine.for %i0 = 0 to 64 {
    %v = affine.load %in[%i0] : memref<64xf32>
    affine.store %v, %out[%i0 floordiv 4, %i0 mod 4] : memref<16x4xf32>
  }

  affine.for %i1 = 0 to 16 {
    affine.for %i2 = 0 to 4 {
      %w = affine.load %out[%i1, %i2] : memref<16x4xf32>
      "foo"(%w) : (f32) -> ()
    }
  }
  return
  // CHECK:      affine.for %{{.*}} =
  // CHECK-NEXT:   affine.for %{{.*}} =
  // CHECK-NOT:    for
  // CHECK:        }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
}

// -----
// CHECK-DAG: [[$MAP0:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 floordiv 4)>
// CHECK-DAG: [[$MAP1:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 mod 4)>
// CHECK-DAG: [[$MAP2:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (d0 * 4 + d1)>

// Reshape a 16x4xf32 to 64xf32.
// CHECK-LABEL: func @fuse_reshape_16_4_64
func.func @fuse_reshape_16_4_64() {
  %in = memref.alloc() : memref<16x4xf32>
  %out = memref.alloc() : memref<64xf32>

  affine.for %i0 = 0 to 16 {
    affine.for %i1 = 0 to 4 {
      %v = affine.load %in[%i0, %i1] : memref<16x4xf32>
      affine.store %v, %out[4*%i0 + %i1] : memref<64xf32>
    }
  }

  affine.for %i2 = 0 to 64 {
    %w = affine.load %out[%i2] : memref<64xf32>
    "foo"(%w) : (f32) -> ()
  }
// CHECK:       affine.for %{{.*}} = 0 to 64 {
// CHECK-NEXT:    affine.apply [[$MAP0]](%{{.*}})
// CHECK-NEXT:    affine.apply [[$MAP1]](%{{.*}})
// CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<16x4xf32>
// CHECK-NEXT:    affine.apply [[$MAP2]](%{{.*}}, %{{.*}})
// CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
// CHECK-NEXT:    affine.load %{{.*}}[0] : memref<1xf32>
// CHECK-NEXT:    "foo"(%{{.*}}) : (f32) -> ()
// CHECK-NEXT:  }
// CHECK-NEXT:  return
  return
}


// -----

// All three loop nests below (6-d one, 2-d one, 2-d one is fused into a single
// 2-d loop nest).
func.func @R6_to_R2_reshape_square() -> memref<64x9xi32> {
  %in = memref.alloc() : memref<2x2x3x3x16x1xi32>
  %out = memref.alloc() : memref<64x9xi32>
  %live_out = memref.alloc() : memref<64x9xi32>

  // Initialize input.
  affine.for %i0 = 0 to 2 {
    affine.for %i1 = 0 to 2 {
      affine.for %i2 = 0 to 3 {
        affine.for %i3 = 0 to 3 {
          affine.for %i4 = 0 to 16 {
            affine.for %i5 = 0 to 1 {
              %val = "foo"(%i0, %i1, %i2, %i3, %i4, %i5) : (index, index, index, index, index, index) -> i32
              affine.store %val, %in[%i0, %i1, %i2, %i3, %i4, %i5] : memref<2x2x3x3x16x1xi32>
            }
          }
        }
      }
    }
  }

  affine.for %ii = 0 to 64 {
    affine.for %jj = 0 to 9 {
      // Convert output coordinates to linear index.
      %a0 = affine.apply affine_map<(d0, d1) -> (d0 * 9 + d1)> (%ii, %jj)
      %0 = affine.apply affine_map<(d0) -> (d0 floordiv (2 * 3 * 3 * 16 * 1))>(%a0)
      %1 = affine.apply affine_map<(d0) -> ((d0 mod 288) floordiv (3 * 3 * 16 * 1))>(%a0)
      %2 = affine.apply affine_map<(d0) -> (((d0 mod 288) mod 144) floordiv (3 * 16 * 1))>(%a0)
      %3 = affine.apply affine_map<(d0) -> ((((d0 mod 288) mod 144) mod 48) floordiv (16 * 1))>(%a0)
      %4 = affine.apply affine_map<(d0) -> ((((d0 mod 288) mod 144) mod 48) mod 16)>(%a0)
      %5 = affine.apply affine_map<(d0) -> (((((d0 mod 144) mod 144) mod 48) mod 16) mod 1)>(%a0)
      %v = affine.load %in[%0, %1, %2, %3, %4, %5] : memref<2x2x3x3x16x1xi32>
      affine.store %v, %out[%ii, %jj] : memref<64x9xi32>
    }
  }

  affine.for %i = 0 to 64 {
    affine.for %j = 0 to 9 {
      %a = affine.load %out[%i, %j] : memref<64x9xi32>
      %b = arith.muli %a, %a : i32
      affine.store %b, %live_out[%i, %j] : memref<64x9xi32>
    }
  }
  return %live_out : memref<64x9xi32>
}
// Everything above is fused to a single 2-d loop nest, and the 6-d tensor %in
// is eliminated if -memref-dataflow-opt is also supplied.
//
// CHECK-DAG: [[$MAP0:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> ((d0 * 9 + d1) floordiv 288)>
// CHECK-DAG: [[$MAP1:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (((d0 * 9 + d1) mod 288) floordiv 144)>
// CHECK-DAG: [[$MAP2:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (((d0 * 9 + d1) mod 144) floordiv 48)>
// CHECK-DAG: [[$MAP3:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (((d0 * 9 + d1) mod 48) floordiv 16)>
// CHECK-DAG: [[$MAP4:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> ((d0 * 9 + d1) mod 16)>
// CHECK-DAG: [[$MAP11:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (d0 * 9 + d1)>
// CHECK-DAG: [[$MAP12:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 floordiv 288)>
// CHECK-DAG: [[$MAP13:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> ((d0 mod 288) floordiv 144)>
// CHECK-DAG: [[$MAP14:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> ((d0 mod 144) floordiv 48)>
// CHECK-DAG: [[$MAP15:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> ((d0 mod 48) floordiv 16)>
// CHECK-DAG: [[$MAP16:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 mod 16)>
// CHECK-DAG: [[$MAP17:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (0)>

//
// CHECK-LABEL: func @R6_to_R2_reshape
// CHECK:       memref.alloc() : memref<1x2x3x3x16x1xi32>
// CHECK:       memref.alloc() : memref<1x1xi32>
// CHECK:       memref.alloc() : memref<64x9xi32>
// CHECK-NEXT:  affine.for %{{.*}} = 0 to 64 {
// CHECK-NEXT:    affine.for %{{.*}} = 0 to 9 {
// CHECK-NEXT:      affine.apply [[$MAP0]](%{{.*}}, %{{.*}})
// CHECK-NEXT:      affine.apply [[$MAP1]](%{{.*}}, %{{.*}})
// CHECK-NEXT:      affine.apply [[$MAP2]](%{{.*}}, %{{.*}})
// CHECK-NEXT:      affine.apply [[$MAP3]](%{{.*}}, %{{.*}})
// CHECK-NEXT:      affine.apply [[$MAP4]](%{{.*}}, %{{.*}})
// CHECK-NEXT:      "foo"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (index, index, index, index, index, index) -> i32
// CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[0, ((%{{.*}} * 9 + %{{.*}}) mod 288) floordiv 144, ((%{{.*}} * 9 + %{{.*}}) mod 144) floordiv 48, ((%{{.*}} * 9 + %{{.*}}) mod 48) floordiv 16, (%{{.*}} * 9 + %{{.*}}) mod 16, 0] : memref<1x2x3x3x16x1xi32>
// CHECK-NEXT:      affine.apply [[$MAP11]](%{{.*}}, %{{.*}})
// CHECK-NEXT:      affine.apply [[$MAP12]](%{{.*}})
// CHECK-NEXT:      affine.apply [[$MAP13]](%{{.*}})
// CHECK-NEXT:      affine.apply [[$MAP14]](%{{.*}})
// CHECK-NEXT:      affine.apply [[$MAP15]](%{{.*}})
// CHECK-NEXT:      affine.apply [[$MAP16]](%{{.*}})
// CHECK-NEXT:      affine.apply [[$MAP17]](%{{.*}})
// CHECK-NEXT:      affine.load %{{.*}}[0, ((%{{.*}} * 9 + %{{.*}}) mod 288) floordiv 144, ((%{{.*}} * 9 + %{{.*}}) mod 144) floordiv 48, ((%{{.*}} * 9 + %{{.*}}) mod 48) floordiv 16, (%{{.*}} * 9 + %{{.*}}) mod 16, 0] : memref<1x2x3x3x16x1xi32>
// CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xi32>
// CHECK-NEXT:      affine.load %{{.*}}[0, 0] : memref<1x1xi32>
// CHECK-NEXT:      arith.muli %{{.*}}, %{{.*}} : i32
// CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x9xi32>
// CHECK-NEXT:    }
// CHECK-NEXT:  }
// CHECK-NEXT:  return %{{.*}} : memref<64x9xi32>

// -----

// Symbolic (non-constant) loop bounds: the store nest over %M x (%N + 5) is
// the producer for the load nest over %M x %N (offset by symbol %s).
// CHECK-LABEL: func @fuse_symbolic_bounds
func.func @fuse_symbolic_bounds(%M : index, %N : index) {
  %N_plus_5 = affine.apply affine_map<(d0) -> (d0 + 5)>(%N)
  %m = memref.alloc(%M, %N_plus_5) : memref<? x ? x f32>

  %c0 = arith.constant 0.0 : f32
  %s = arith.constant 5 : index

  affine.for %i0 = 0 to %M {
    affine.for %i1 = 0 to affine_map<(d0) -> (d0 + 5)> (%N) {
      affine.store %c0, %m[%i0, %i1] : memref<? x ? x f32>
    }
  }

  affine.for %i2 = 0 to %M {
    affine.for %i3 = 0 to %N {
      %v = affine.load %m[%i2, %i3 + symbol(%s)] : memref<? x ? x f32>
    }
  }

  return
}

// -----

// CHECK-LABEL: func @should_fuse_reduction_at_depth_of_one
func.func @should_fuse_reduction_at_depth_of_one() {
  %a = memref.alloc() : memref<10x100xf32>
  %b = memref.alloc() : memref<10xf32>

  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 100 {
      %v0 = affine.load %b[%i0] : memref<10xf32>
      %v1 = affine.load %a[%i0, %i1] : memref<10x100xf32>
      %v2 = "maxf"(%v0, %v1) : (f32, f32) -> f32
      affine.store %v2, %b[%i0] : memref<10xf32>
    }
  }
  affine.for %i2 = 0 to 10 {
    affine.for %i3 = 0 to 100 {
      %v3 = affine.load %b[%i2] : memref<10xf32>
      %v4 = affine.load %a[%i2, %i3] : memref<10x100xf32>
      %v5 = arith.subf %v4, %v3 : f32
      affine.store %v5, %b[%i2] : memref<10xf32>
    }
  }
  // This test should fuse the src reduction loop at depth 1 in the destination
  // loop nest, which improves locality and enables subsequence passes to
  // decrease the reduction memref size and possibly place it in a faster
  // memory space.
  // CHECK:       affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:    affine.for %{{.*}} = 0 to 100 {
  // CHECK-NEXT:      affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:      affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x100xf32>
  // CHECK-NEXT:      "maxf"(%{{.*}}, %{{.*}}) : (f32, f32) -> f32
  // CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:    }
  // CHECK-NEXT:    affine.for %{{.*}} = 0 to 100 {
  // CHECK-NEXT:      affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:      affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x100xf32>
  // CHECK-NEXT:      arith.subf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:    }
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return
  return
}
850
851// -----
852
// CHECK-LABEL: func @should_fuse_at_src_depth1_and_dst_depth1
func.func @should_fuse_at_src_depth1_and_dst_depth1() {
  %a = memref.alloc() : memref<100x16xf32>
  %b = memref.alloc() : memref<100x16xf32>

  affine.for %i0 = 0 to 100 {
    affine.for %i1 = 0 to 16 {
      %v0 = affine.load %a[%i0, %i1] : memref<100x16xf32>
      "op0"(%v0) : (f32) -> ()
    }
    affine.for %i2 = 0 to 16 {
      %v1 = "op1"() : () -> (f32)
      affine.store %v1, %b[%i0, %i2] : memref<100x16xf32>
    }
  }

  affine.for %i3 = 0 to 100 {
    affine.for %i4 = 0 to 16 {
      %v2 = affine.load %b[%i3, %i4] : memref<100x16xf32>
      "op2"(%v2) : (f32) -> ()
    }
  }
  // We can slice iterations of the '%i0' and '%i1' loops in the source
  // loop nest, but slicing at depth 2 and inserting the slice in the
  // destination loop nest at depth 2 causes extra computation. Instead,
  // the fusion algorithm should detect that the source loop should be sliced
  // at depth 1 and the slice should be inserted at depth 1.
  // CHECK:       affine.for %{{.*}} = 0 to 100 {
  // CHECK-NEXT:    affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT:      affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<100x16xf32>
  // CHECK-NEXT:      "op0"(%{{.*}}) : (f32) -> ()
  // CHECK-NEXT:    }
  // CHECK-NEXT:    affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT:      %{{.*}} = "op1"() : () -> f32
  // CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[0, %{{.*}}] : memref<1x16xf32>
  // CHECK-NEXT:    }
  // CHECK-NEXT:    affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT:      affine.load %{{.*}}[0, %{{.*}}] : memref<1x16xf32>
  // CHECK-NEXT:      "op2"(%{{.*}}) : (f32) -> ()
  // CHECK-NEXT:    }
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return
  return
}
897
898// -----
// CHECK: [[$MAP0:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 * 10 + d1)>

// CHECK-LABEL: func @should_fuse_src_depth1_at_dst_depth2
func.func @should_fuse_src_depth1_at_dst_depth2() {
  %a = memref.alloc() : memref<100xf32>
  %c0 = arith.constant 0.0 : f32

  affine.for %i0 = 0 to 100 {
    affine.store %c0, %a[%i0] : memref<100xf32>
  }

  // The consumer linearizes its two IVs into a 1-d index: d0 * 10 + d1.
  affine.for %i1 = 0 to 10 {
    affine.for %i2 = 0 to 10 {
      %a0 = affine.apply affine_map<(d0, d1) -> (d0 * 10 + d1)> (%i1, %i2)
      %v0 = affine.load %a[%a0] : memref<100xf32>
    }
  }
  // The source loop nest slice loop bound is a function of both destination
  // loop IVs, so we should slice at depth 1 and insert the slice at depth 2.
  // CHECK:       affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:    affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:      affine.apply [[$MAP0]](%{{.*}}, %{{.*}})
  // CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:      affine.apply [[$MAP0]](%{{.*}}, %{{.*}})
  // CHECK-NEXT:      affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:    }
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return
  return
}
929
930// -----
931
// CHECK-LABEL: func @fusion_at_depth0_not_currently_supported
// NOTE(review): despite the function name, the CHECK lines below expect the
// store and the (loop-invariant, %c0-indexed) load to end up fused in a single
// loop with a 1-element private memref; the name appears historical — confirm.
func.func @fusion_at_depth0_not_currently_supported() {
  %0 = memref.alloc() : memref<10xf32>
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i0 = 0 to 10 {
    affine.store %cst, %0[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %1 = affine.load %0[%c0] : memref<10xf32>
  }
  // NOTE: Should shrink memref size to 1 element access by load in dst loop
  // nest, and make the store in the slice store to the same element.
  // CHECK-DAG:   memref.alloc() : memref<1xf32>
  // CHECK:       affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:    affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return
  return
}
953
954// -----
955
// CHECK-LABEL: func @should_fuse_deep_loop_nests
// Stress test: fusion of a 6-deep producer nest into a 10-deep consumer nest,
// with the producer's %1 accesses privatized into a 1x1x1x1x16x10 buffer.
func.func @should_fuse_deep_loop_nests() {
  %0 = memref.alloc() : memref<2x2x3x3x16x10xf32, 2>
  %1 = memref.alloc() : memref<2x2x3x3x16x10xf32, 2>
  %2 = memref.alloc() : memref<3x3x3x3x16x10xf32, 2>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c1_0 = arith.constant 1 : index
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i0 = 0 to 2 {
    affine.for %i1 = 0 to 2 {
      affine.for %i2 = 0 to 3 {
        affine.for %i3 = 0 to 3 {
          affine.for %i4 = 0 to 16 {
            affine.for %i5 = 0 to 10 {
              %3 = affine.load %0[%i0, %i1, %i2, %i3, %i4, %i5]
                : memref<2x2x3x3x16x10xf32, 2>
            }
          }
          affine.for %i6 = 0 to 16 {
            affine.for %i7 = 0 to 10 {
              affine.store %cst, %1[%i0, %i1, %i2, %i3, %i6, %i7]
                : memref<2x2x3x3x16x10xf32, 2>
            }
          }
        }
      }
    }
  }
  affine.for %i8 = 0 to 3 {
    affine.for %i9 = 0 to 3 {
      affine.for %i10 = 0 to 2 {
        affine.for %i11 = 0 to 2 {
          affine.for %i12 = 0 to 3 {
            affine.for %i13 = 0 to 3 {
              affine.for %i14 = 0 to 2 {
                affine.for %i15 = 0 to 2 {
                  affine.for %i16 = 0 to 16 {
                    affine.for %i17 = 0 to 10 {
                      %5 = affine.load %0[%i14, %i15, %i12, %i13, %i16, %i17]
                        : memref<2x2x3x3x16x10xf32, 2>
                    }
                  }
                  affine.for %i18 = 0 to 16 {
                    affine.for %i19 = 0 to 10 {
                      %6 = affine.load %1[%i10, %i11, %i8, %i9, %i18, %i19]
                        : memref<2x2x3x3x16x10xf32, 2>
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
// The first four loops of the source loop nest can be sliced with iteration
// bounds which are a function of the first four loops of the destination loop
// nest, where the destination loop nests have been interchanged.

// CHECK-DAG:   memref.alloc() : memref<1x1x1x1x16x10xf32, 2>
// CHECK:       affine.for %{{.*}} = 0 to 3 {
// CHECK-NEXT:    affine.for %{{.*}} = 0 to 3 {
// CHECK-NEXT:      affine.for %{{.*}} = 0 to 2 {
// CHECK-NEXT:        affine.for %{{.*}} = 0 to 2 {
// CHECK-NEXT:          affine.for %{{.*}} = 0 to 3 {
// CHECK-NEXT:            affine.for %{{.*}} = 0 to 3 {
// CHECK-NEXT:              affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT:                affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:                  affine.load %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] : memref<2x2x3x3x16x10xf32, 2>
// CHECK-NEXT:                }
// CHECK-NEXT:              }
// CHECK-NEXT:              affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT:                affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:                  affine.store %{{.*}}, %{{.*}}[0, 0, 0, 0, %{{.*}}, %{{.*}}] : memref<1x1x1x1x16x10xf32, 2>
// CHECK-NEXT:                }
// CHECK-NEXT:              }
// CHECK-NEXT:              affine.for %{{.*}} = 0 to 2 {
// CHECK-NEXT:                affine.for %{{.*}} = 0 to 2 {
// CHECK-NEXT:                  affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT:                    affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:                      affine.load %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] : memref<2x2x3x3x16x10xf32, 2>
// CHECK-NEXT:                    }
// CHECK-NEXT:                  }
// CHECK-NEXT:                  affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT:                    affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:                      affine.load %{{.*}}[0, 0, 0, 0, %{{.*}}, %{{.*}}] : memref<1x1x1x1x16x10xf32, 2>
// CHECK-NEXT:                    }
// CHECK-NEXT:                  }
// CHECK-NEXT:                }
// CHECK-NEXT:              }
// CHECK-NEXT:            }
// CHECK-NEXT:          }
// CHECK-NEXT:        }
// CHECK-NEXT:      }
// CHECK-NEXT:    }
// CHECK-NEXT:  }
// CHECK-NEXT:  return
  return
}
1057
1058// -----
1059
// CHECK-LABEL: func @should_fuse_at_depth1_and_reduce_slice_trip_count
func.func @should_fuse_at_depth1_and_reduce_slice_trip_count() {
  %a = memref.alloc() : memref<4x256xf32>
  %b = memref.alloc() : memref<4x256xf32>

  %c0 = arith.constant 0 : index
  %cf0 = arith.constant 0.0 : f32

  affine.for %i0 = 0 to 4 {
    affine.for %i1 = 0 to 256 {
      %v0 = affine.load %b[%i0, %i1] : memref<4x256xf32>
    }
    affine.for %i2 = 0 to 256 {
      affine.store %cf0, %a[%i0, %i2] : memref<4x256xf32>
    }
  }

  affine.for %d0 = 0 to 4 {
    affine.for %d1 = 0 to 16 {
      %v1 = affine.load %a[%d0, %d1] : memref<4x256xf32>
    }
  }
  // The cost of fusing at depth 2 is greater than the cost of fusing at depth 1
  // for two reasons:
  // 1) Inserting the unsliceable src loop %i1 to a higher depth removes
  //    redundant computation and reduces costs.
  // 2) Inserting the sliceable src loop %i2 at depth 1, we can still reduce
  //    its trip count to 16 (from 256) reducing costs.
  // NOTE: the size of the private memref created for the fused loop nest
  // is reduced from the original 4x256 shape to 1x16 because of the
  // data accessed by the load.
  // CHECK-DAG:   memref.alloc() : memref<1x16xf32>
  // CHECK:       affine.for %{{.*}} = 0 to 4 {
  // CHECK-NEXT:    affine.for %{{.*}} = 0 to 256 {
  // CHECK-NEXT:      affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4x256xf32>
  // CHECK-NEXT:    }
  // CHECK-NEXT:    affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[0, %{{.*}}] : memref<1x16xf32>
  // CHECK-NEXT:    }
  // CHECK-NEXT:    affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT:      affine.load %{{.*}}[0, %{{.*}}] : memref<1x16xf32>
  // CHECK-NEXT:    }
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return
  return
}
1106
1107// -----
1108
// CHECK-LABEL: func @should_fuse_at_depth1_with_trip_count_20
// The two consumer read regions are [0, 10) and [0, 20); the fused producer
// slice must cover their union, so its trip count becomes 20.
func.func @should_fuse_at_depth1_with_trip_count_20() {
  %a = memref.alloc() : memref<100xf32>
  %c0 = arith.constant 0 : index
  %cf0 = arith.constant 0.0 : f32

  affine.for %i0 = 0 to 100 {
    affine.store %cf0, %a[%i0]: memref<100xf32>
  }

  affine.for %i1 = 0 to 5 {
    affine.for %i2 = 0 to 10 {
      %v0 = affine.load %a[%i2]: memref<100xf32>
    }
    affine.for %i3 = 0 to 10 {
      affine.for %i4 = 0 to 20 {
        %v1 = affine.load %a[%i4]: memref<100xf32>
      }
    }
  }
  // NOTE: The size of the private memref created for fusion is shrunk to 20xf32
  // CHECK-DAG:   memref.alloc() : memref<20xf32>
  // CHECK:       affine.for %{{.*}} = 0 to 5 {
  // CHECK-NEXT:    affine.for %{{.*}} = 0 to 20 {
  // CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<20xf32>
  // CHECK-NEXT:    }
  // CHECK-NEXT:    affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:      affine.load %{{.*}}[%{{.*}}] : memref<20xf32>
  // CHECK-NEXT:    }
  // CHECK-NEXT:    affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:      affine.for %{{.*}} = 0 to 20 {
  // CHECK-NEXT:        affine.load %{{.*}}[%{{.*}}] : memref<20xf32>
  // CHECK-NEXT:      }
  // CHECK-NEXT:    }
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return
  return
}
1147
1148// -----
1149
// CHECK-LABEL: func @should_fuse_at_depth1_with_trip_count_19
// Variant of the trip-count-20 case: the union of the consumer read regions
// is [0, 19), so the fused producer slice gets trip count 19.
func.func @should_fuse_at_depth1_with_trip_count_19() {
  %a = memref.alloc() : memref<100xf32>
  %c0 = arith.constant 0 : index
  %cf0 = arith.constant 0.0 : f32

  affine.for %i0 = 0 to 100 {
    affine.store %cf0, %a[%i0]: memref<100xf32>
  }

  affine.for %i1 = 0 to 5 {
    affine.for %i2 = 0 to 19 {
      %v0 = affine.load %a[%i2]: memref<100xf32>
    }
    affine.for %i3 = 0 to 10 {
      affine.for %i4 = 0 to 10 {
        %v1 = affine.load %a[%i4]: memref<100xf32>
      }
    }
  }
  // NOTE: The size of the private memref created for fusion is shrunk to 19xf32
  // CHECK-DAG:   memref.alloc() : memref<19xf32>
  // CHECK:       affine.for %{{.*}} = 0 to 5 {
  // CHECK-NEXT:    affine.for %{{.*}} = 0 to 19 {
  // CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<19xf32>
  // CHECK-NEXT:    }
  // CHECK-NEXT:    affine.for %{{.*}} = 0 to 19 {
  // CHECK-NEXT:      affine.load %{{.*}}[%{{.*}}] : memref<19xf32>
  // CHECK-NEXT:    }
  // CHECK-NEXT:    affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:      affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:        affine.load %{{.*}}[%{{.*}}] : memref<19xf32>
  // CHECK-NEXT:      }
  // CHECK-NEXT:    }
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return
  return
}
1188
1189
1190// -----
1191
// CHECK-LABEL: func @should_fuse_with_private_memref() {
func.func @should_fuse_with_private_memref() {
  %m = memref.alloc() : memref<100xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 100 {
    affine.store %cf7, %m[%i0] : memref<100xf32>
  }
  affine.for %i1 = 0 to 17 {
    %v0 = affine.load %m[%i1] : memref<100xf32>
  }
  affine.for %i2 = 0 to 82 {
    %v1 = affine.load %m[%i2] : memref<100xf32>
  }
  // Should create a new private memref (shrunk to a single element, per the
  // 'memref<1xf32>' accesses checked below).
  // CHECK-DAG:  memref.alloc() : memref<1xf32>
  // CHECK:      affine.for %{{.*}} = 0 to 17 {
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}
1216
1217// -----
1218
// CHECK-LABEL: func @should_fuse_live_out_arg_but_preserve_src_loop(%{{.*}}: memref<10xf32>) {
func.func @should_fuse_live_out_arg_but_preserve_src_loop(%arg0: memref<10xf32>) {
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %arg0[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 9 {
    %v0 = affine.load %arg0[%i1] : memref<10xf32>
  }
  // This tests that the loop nest '%i0' should not be removed after fusion
  // because it writes to memref argument '%arg0', and its read region
  // does not cover its write region (so fusion would shrink the write region
  // in the fused loop nest, and the complete live-out data region would not
  // be written).
  // CHECK:       affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:    affine.store %{{.*}} : memref<10xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  affine.for %{{.*}} = 0 to 9 {
  // CHECK-NEXT:    affine.store %{{.*}} : memref<1xf32>
  // CHECK-NEXT:    affine.load %{{.*}} : memref<1xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return
  return
}
1244
1245// -----
1246
// CHECK-LABEL: func @should_fuse_live_out_arg(%{{.*}}: memref<10xf32>) {
// Unlike the preserve-src-loop case, here the consumer reads the entire
// written region, so the source loop can be fused away entirely.
func.func @should_fuse_live_out_arg(%arg0: memref<10xf32>) {
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %arg0[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %arg0[%i1] : memref<10xf32>
  }
  // The read/write regions for memref '%{{.*}}' are the same for both
  // loops, so they should fuse.

  // CHECK:       affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return
  return
}
1267
1268// -----
1269
// CHECK-LABEL: func @should_fuse_escaping_memref_but_preserve_src_loop() -> memref<10xf32>
// Same shape as the live-out-arg preserve case, but the memref escapes via the
// return value instead of being a function argument.
func.func @should_fuse_escaping_memref_but_preserve_src_loop() -> memref<10xf32> {
  %cf7 = arith.constant 7.0 : f32
  %m = memref.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 9 {
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }
  // This tests that the loop nest '%i0' should not be removed after fusion
  // because it writes to memref '%m', which is returned by the function, and
  // the '%i1' memory region does not cover '%i0' memory region.

  // CHECK-DAG:   memref.alloc() : memref<1xf32>
  // CHECK:       affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:    affine.store %{{.*}} : memref<10xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  affine.for %{{.*}} = 0 to 9 {
  // CHECK-NEXT:    affine.store %{{.*}} : memref<1xf32>
  // CHECK-NEXT:    affine.load %{{.*}} : memref<1xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return %{{.*}} : memref<10xf32>
  return %m : memref<10xf32>
}
1295// -----
1296
// Reshape-like access pattern: the consumer linearizes (%ii, %jj) and then
// recovers the leading index with a floordiv. This should fuse with the %in
// becoming a 1x1x1 private memref.
func.func @R3_to_R2_reshape() {
  %in = memref.alloc() : memref<2x3x16xi32>

  %c0 = arith.constant 0 : index

  affine.for %i0 = 0 to 2 {
    affine.for %i1 = 0 to 3 {
      affine.for %i2 = 0 to 16 {
        %val = "foo"(%i0, %i1, %i2) : (index, index, index) -> i32
        affine.store %val, %in[%i0, %i1, %i2] : memref<2x3x16xi32>
      }
    }
  }

  affine.for %ii = 0 to 32 {
    affine.for %jj = 0 to 3 {
      %a0 = affine.apply affine_map<(d0, d1) -> (d0 * 3 + d1)> (%ii, %jj)
      %idx = affine.apply affine_map<(d0) -> (d0 floordiv (3 * 16))> (%a0)
      %v = affine.load %in[%idx, %jj, %c0]
        : memref<2x3x16xi32>
    }
  }
  return
}
// CHECK-DAG: [[$MAP0:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> ((d0 * 3 + d1) floordiv 48)>
// CHECK-DAG: [[$MAP1:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (d0 * 3 + d1)>
// CHECK-DAG: [[$MAP2:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 floordiv 48)>

// CHECK-LABEL: func @R3_to_R2_reshape()
// CHECK-DAG:    memref.alloc() : memref<1x1x1xi32>
// CHECK:        affine.for %{{.*}} = 0 to 32 {
// CHECK-NEXT:     affine.for %{{.*}} = 0 to 3 {
// CHECK-NEXT:      affine.apply [[$MAP0]](%{{.*}}, %{{.*}})
// CHECK-NEXT:      "foo"(%{{.*}}, %{{.*}}, %{{.*}}) : (index, index, index) -> i32
// CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[0, 0, 0] : memref<1x1x1xi32>
// CHECK-NEXT:      affine.apply [[$MAP1]](%{{.*}}, %{{.*}})
// CHECK-NEXT:      affine.apply [[$MAP2]](%{{.*}})
// CHECK-NEXT:      affine.load %{{.*}}[0, 0, 0] : memref<1x1x1xi32>
// CHECK-NEXT:    }
// CHECK-NEXT:  }
// CHECK-NEXT:  return
1339
1340// -----
1341
// A producer writing two memrefs is fused into a consumer reading both; each
// memref is privatized to a single element (see the CHECK lines below).
func.func @should_fuse_multi_output_producer() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %a[%i0] : memref<10xf32>
    affine.store %cf7, %b[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %a[%i1] : memref<10xf32>
    %v1 = affine.load %b[%i1] : memref<10xf32>
  }

  // CHECK:       affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:    affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:    affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return
  return
}
1366
1367// -----
1368
// CHECK-LABEL: func @fusion_preventing_deps_on_middle_loop() {
// Negative test: dependences through the middle loop block fusion, so all
// three loops must remain separate.
func.func @fusion_preventing_deps_on_middle_loop() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %a[%i0] : memref<10xf32>
    affine.store %v0, %b[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %a[%i1] : memref<10xf32>
    %v1 = affine.load %c[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v2 = affine.load %b[%i2] : memref<10xf32>
    affine.store %v2, %c[%i2] : memref<10xf32>
  }
  // Loops '%i0' and '%i2' cannot fuse along producer/consumer edge on memref
  // '%b', because of the WAR dep from '%i0' to '%i1' on memref '%a' and
  // because of the WAR dep from '%i1' to '%i2' on memref '%c'.
  // CHECK:       affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return
  return
}
1407
1408// -----
1409
// CHECK-LABEL: func @should_fuse_and_move_to_preserve_war_dep() {
// Tests that fusion can reposition the fused nest (between '%i1' and '%i2')
// to keep the WAR dependence on '%b' legal.
func.func @should_fuse_and_move_to_preserve_war_dep() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %b[%i0] : memref<10xf32>
    affine.store %v0, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 3 {
    %v2 = affine.load %c[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 5 {
    affine.store %cf7, %b[%i2] : memref<10xf32>
  }
  affine.for %i3 = 0 to 10 {
    %v1 = affine.load %a[%i3] : memref<10xf32>
    affine.store %cf7, %c[%i3] : memref<10xf32>
  }

  // Dependence graph:
  //
  //         %i0 ---------
  //               |     |
  //     --- %i1   | %b  | %a
  //     |         |     |
  //  %c |   %i2 <--     |
  //     |               |
  //     --> %i3 <--------
  //
  // It is possible to fuse loop '%i0' into '%i3' and preserve dependences
  // if the fused loop nest is inserted between loops '%i1' and '%i2'.

  // CHECK-DAG:   memref.alloc() : memref<1xf32>
  // CHECK:       affine.for %{{.*}} = 0 to 3 {
  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:    affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  affine.for %{{.*}} = 0 to 5 {
  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return
  return
}
1462
1463// -----
1464
// CHECK-LABEL: func @fusion_preventing_dep_on_constant() {
// Negative test: an SSA value defined between the loops ('%cf11') prevents
// moving the consumer, so no fusion occurs.
func.func @fusion_preventing_dep_on_constant() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %b[%i0] : memref<10xf32>
    affine.store %cf7, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %b[%i1] : memref<10xf32>
  }
  %cf11 = arith.constant 11.0 : f32
  affine.for %i2 = 0 to 10 {
    %v2 = affine.load %a[%i2] : memref<10xf32>
    affine.store %cf11, %c[%i2] : memref<10xf32>
  }
  // Loops '%i0' and '%i2' cannot fuse along producer/consumer edge on memref
  // '%a', because of the WAR dep from '%i0' to '%i1' on memref '%b' and
  // because of the SSA value dep from '%cf11' def to use in '%i2'.
  // CHECK:       affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  %{{.*}} = arith.constant 1.100000e+01 : f32
  // CHECK-NEXT:  affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return
  return
}
1503
1504// -----
1505
// CHECK-LABEL: func @should_fuse_and_preserve_dep_on_constant() {
// Positive counterpart of the constant-dep case: '%cf11' is defined before all
// loops here, so fusion of '%i0' into '%i2' is legal.
func.func @should_fuse_and_preserve_dep_on_constant() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32
  %cf11 = arith.constant 11.0 : f32
  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %b[%i0] : memref<10xf32>
    affine.store %cf7, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %b[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v2 = affine.load %a[%i2] : memref<10xf32>
    affine.store %cf11, %c[%i2] : memref<10xf32>
  }

  // Loops '%i0' and '%i2' can fuse along producer/consumer edge on memref
  // '%a', and preserve the WAR dep from '%i0' to '%i1' on memref '%b', and
  // the SSA value dep from '%cf11' def to use in '%i2'.

  // CHECK:       arith.constant 1.100000e+01 : f32
  // CHECK-NEXT:  affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:    affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return
  return
}
1543
1544// -----
1545
// CHECK-LABEL: @producer_consumer_with_outmost_user
// The producer/consumer pair inside the '%arg1' loop is fused even though
// '%src' has an additional user outside the loops (the dma_start), so the
// original '%src' memref and the dma_start must be preserved.
// NOTE(review): "outmost" in the name presumably means "outermost"; renaming
// would require updating the CHECK-LABEL, so it is left as-is.
func.func @producer_consumer_with_outmost_user(%arg0 : f16) {
  %c0 = arith.constant 0 : index
  %src = memref.alloc() : memref<f16, 1>
  %dst = memref.alloc() : memref<f16>
  %tag = memref.alloc() : memref<1xi32>
  affine.for %arg1 = 4 to 6 {
    affine.for %arg2 = 0 to 1 {
      %0 = arith.addf %arg0, %arg0 : f16
      affine.store %0, %src[] : memref<f16, 1>
    }
    affine.for %arg3 = 0 to 1 {
      %0 = affine.load %src[] : memref<f16, 1>
    }
  }
  affine.dma_start %src[], %dst[], %tag[%c0], %c0 : memref<f16, 1>, memref<f16>, memref<1xi32>
  // CHECK:       %[[CST_INDEX:.*]] = arith.constant 0 : index
  // CHECK:       %[[DMA_SRC:.*]] = memref.alloc() : memref<f16, 1>
  // CHECK:       %[[DMA_DST:.*]] = memref.alloc() : memref<f16>
  // CHECK:       %[[DMA_TAG:.*]] = memref.alloc() : memref<1xi32>
  // CHECK:       affine.for %arg1 = 4 to 6
  // CHECK-NEXT:  affine.for %arg2 = 0 to 1
  // CHECK-NEXT:  %[[RESULT_ADD:.*]] = arith.addf %arg0, %arg0 : f16
  // CHECK-NEXT:  affine.store %[[RESULT_ADD]], %[[DMA_SRC]][] : memref<f16, 1>
  // CHECK-NEXT:  affine.load %[[DMA_SRC]][] : memref<f16, 1>
  // CHECK:       affine.dma_start %[[DMA_SRC]][], %[[DMA_DST]][], %[[DMA_TAG]][%[[CST_INDEX]]], %[[CST_INDEX]] : memref<f16, 1>, memref<f16>, memref<1xi32>
  // CHECK-NEXT:  return
  return
}
1575
1576// Add further tests in mlir/test/Transforms/loop-fusion-4.mlir
1577
1578