xref: /llvm-project/mlir/test/Dialect/Affine/loop-fusion-3.mlir (revision a5985ca51dd7e0759d65fac9cb2b6a4448ebc404)
1// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion))' -split-input-file | FileCheck %s
2// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{fusion-maximal}))' -split-input-file | FileCheck %s --check-prefix=MAXIMAL
3
4// Part I of fusion tests in mlir/test/Dialect/Affine/loop-fusion.mlir.
5// Part II of fusion tests in mlir/test/Dialect/Affine/loop-fusion-2.mlir.
6// Part IV of fusion tests in mlir/test/Dialect/Affine/loop-fusion-4.mlir.
7
8// -----
9
10// Test case from github bug 777.
11// CHECK-LABEL: func @mul_add_0
// Input: three loop nests sharing the function-local 3x3 buffer %0:
//   1) zero-initialization of %0,
//   2) multiply-accumulate of %arg0 and %arg1 into %0 over a 4-iteration
//      innermost loop,
//   3) %0 + %arg2 stored to %arg3.
// The directives at the end of the body expect a single fused nest in which
// the intermediate buffer shows up as memref<1x1xf32>.
12func.func @mul_add_0(%arg0: memref<3x4xf32>, %arg1: memref<4x3xf32>, %arg2: memref<3x3xf32>, %arg3: memref<3x3xf32>) {
13  %cst = arith.constant 0.000000e+00 : f32
14  %0 = memref.alloc() : memref<3x3xf32>
15  affine.for %arg4 = 0 to 3 {
16    affine.for %arg5 = 0 to 3 {
17      affine.store %cst, %0[%arg4, %arg5] : memref<3x3xf32>
18    }
19  }
20  affine.for %arg4 = 0 to 3 {
21    affine.for %arg5 = 0 to 3 {
22      affine.for %arg6 = 0 to 4 {
23        %1 = affine.load %arg1[%arg6, %arg5] : memref<4x3xf32>
24        %2 = affine.load %arg0[%arg4, %arg6] : memref<3x4xf32>
25        %3 = arith.mulf %2, %1 : f32
26        %4 = affine.load %0[%arg4, %arg5] : memref<3x3xf32>
27        %5 = arith.addf %4, %3 : f32
28        affine.store %5, %0[%arg4, %arg5] : memref<3x3xf32>
29      }
30    }
31  }
32  affine.for %arg4 = 0 to 3 {
33    affine.for %arg5 = 0 to 3 {
34      %6 = affine.load %arg2[%arg4, %arg5] : memref<3x3xf32>
35      %7 = affine.load %0[%arg4, %arg5] : memref<3x3xf32>
36      %8 = arith.addf %7, %6 : f32
37      affine.store %8, %arg3[%arg4, %arg5] : memref<3x3xf32>
38    }
39  }
40  // CHECK:      affine.for %[[i0:.*]] = 0 to 3 {
41  // CHECK-NEXT:   affine.for %[[i1:.*]] = 0 to 3 {
42  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
43  // CHECK-NEXT:     affine.for %[[i2:.*]] = 0 to 4 {
44  // CHECK-NEXT:       affine.load %{{.*}}[%[[i2]], %[[i1]]] : memref<4x3xf32>
45  // CHECK-NEXT:       affine.load %{{.*}}[%[[i0]], %[[i2]]] : memref<3x4xf32>
46  // CHECK-NEXT:       arith.mulf %{{.*}}, %{{.*}} : f32
47  // CHECK-NEXT:       affine.load %{{.*}}[0, 0] : memref<1x1xf32>
48  // CHECK-NEXT:       arith.addf %{{.*}}, %{{.*}} : f32
49  // CHECK-NEXT:       affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
50  // CHECK-NEXT:     }
51  // CHECK-NEXT:     affine.load %{{.*}}[%[[i0]], %[[i1]]] : memref<3x3xf32>
52  // CHECK-NEXT:     affine.load %{{.*}}[0, 0] : memref<1x1xf32>
53  // CHECK-NEXT:     arith.addf %{{.*}}, %{{.*}} : f32
54  // CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[%[[i0]], %[[i1]]] : memref<3x3xf32>
55  // CHECK-NEXT:   }
56  // CHECK-NEXT: }
57  // CHECK-NEXT: return
58  return
59}
60
61// -----
62
63// Verify that 'fuseProducerConsumerNodes' fuses a producer loop with a store
64// that has multiple outgoing edges.
65
66// CHECK-LABEL: func @should_fuse_multi_outgoing_edge_store_producer
// Input: one single-iteration loop that stores to %a, followed by two
// single-iteration loops that each load from %a. The directives in the body
// expect one loop containing the store followed by both loads.
67func.func @should_fuse_multi_outgoing_edge_store_producer(%a : memref<1xf32>) {
68  %cst = arith.constant 0.000000e+00 : f32
69  affine.for %arg0 = 0 to 1 {
70    affine.store %cst, %a[%arg0] : memref<1xf32>
71  }
72
73  affine.for %arg0 = 0 to 1 {
74    %0 = affine.load %a[%arg0] : memref<1xf32>
75  }
76
77  affine.for %arg0 = 0 to 1 {
78    %0 = affine.load %a[%arg0] : memref<1xf32>
79  }
80  // CHECK:      affine.for %{{.*}} = 0 to 1 {
81  // CHECK-NEXT:   affine.store
82  // CHECK-NEXT:   affine.load
83  // CHECK-NEXT:   affine.load
84  // CHECK-NEXT: }
85
86  return
87}
88
89// -----
90
91// Verify that 'fuseProducerConsumerNodes' fuses a producer loop that: 1) has
92// multiple outgoing edges, 2) producer store has a single outgoing edge.
93// Sibling loop fusion should not fuse any of these loops due to
94// dependencies on external memrefs '%a' and '%b'.
95
96// CHECK-LABEL: func @should_fuse_producer_with_multi_outgoing_edges
// Input: the first loop loads %a and stores %b; the second loop stores %a and
// loads %b. The directives in the body expect a single loop holding the four
// accesses in that order, with no further affine.for before the return.
97func.func @should_fuse_producer_with_multi_outgoing_edges(%a : memref<1xf32>, %b : memref<1xf32>) {
98  %cst = arith.constant 0.000000e+00 : f32
99  affine.for %arg0 = 0 to 1 {
100    %0 = affine.load %a[%arg0] : memref<1xf32>
101    affine.store %cst, %b[%arg0] : memref<1xf32>
102  }
103
104  affine.for %arg0 = 0 to 1 {
105    affine.store %cst, %a[%arg0] : memref<1xf32>
106    %1 = affine.load %b[%arg0] : memref<1xf32>
107  }
108  // CHECK: affine.for %{{.*}} = 0 to 1
109  // CHECK-NEXT: affine.load %[[A:.*]][{{.*}}]
110  // CHECK-NEXT: affine.store %{{.*}}, %[[B:.*]][{{.*}}]
111  // CHECK-NEXT: affine.store %{{.*}}, %[[A]]
112  // CHECK-NEXT: affine.load %[[B]]
113  // CHECK-NOT: affine.for %{{.*}}
114  // CHECK: return
115  return
116}
117
118// MAXIMAL-LABEL: func @reshape_into_matmul
// Input: a 16x64x1024 -> 1024x1024 copy of %R into the function-local buffer
// %rhs, followed by a 3-deep matmul-style nest that reads %rhs and %lhs and
// accumulates into %out. Only the fusion-maximal run has directives for this
// function (they follow the closing brace).
119func.func @reshape_into_matmul(%lhs : memref<1024x1024xf32>,
120              %R: memref<16x64x1024xf32>, %out: memref<1024x1024xf32>) {
121  %rhs = memref.alloc() :  memref<1024x1024xf32>
122
123  // Reshape from 3-d to 2-d.
124  affine.for %i0 = 0 to 16 {
125    affine.for %i1 = 0 to 64 {
126      affine.for %k = 0 to 1024 {
127        %v = affine.load %R[%i0, %i1, %k] : memref<16x64x1024xf32>
128        affine.store %v, %rhs[64*%i0 + %i1, %k] : memref<1024x1024xf32>
129      }
130    }
131  }
132
133  // Matmul.
134  affine.for %i = 0 to 1024 {
135    affine.for %j = 0 to 1024 {
136      affine.for %k = 0 to 1024 {
137        %0 = affine.load %rhs[%k, %j] : memref<1024x1024xf32>
138        %1 = affine.load %lhs[%i, %k] : memref<1024x1024xf32>
139        %2 = arith.mulf %1, %0 : f32
140        %3 = affine.load %out[%i, %j] : memref<1024x1024xf32>
141        %4 = arith.addf %3, %2 : f32
142        affine.store %4, %out[%i, %j] : memref<1024x1024xf32>
143      }
144    }
145  }
146  return
147}
148// MAXIMAL-NEXT: memref.alloc
149// MAXIMAL-NEXT: affine.for
150// MAXIMAL-NEXT:   affine.for
151// MAXIMAL-NEXT:     affine.for
152// MAXIMAL-NOT:      affine.for
153// MAXIMAL:      return
154
155// -----
156
157// CHECK-LABEL: func @vector_loop
// Input: two 10x5 loop nests that move vector<4xf32> values with
// affine.vector_load / affine.vector_store at column offset %i*4. The first
// nest copies %a to %b; the second copies %b to %c.
158func.func @vector_loop(%a : memref<10x20xf32>, %b : memref<10x20xf32>,
159                  %c : memref<10x20xf32>) {
160  affine.for %j = 0 to 10 {
161    affine.for %i = 0 to 5 {
162      %ld0 = affine.vector_load %a[%j, %i*4] : memref<10x20xf32>, vector<4xf32>
163      affine.vector_store %ld0, %b[%j, %i*4] : memref<10x20xf32>, vector<4xf32>
164    }
165  }
166
167  affine.for %j = 0 to 10 {
168    affine.for %i = 0 to 5 {
169      %ld0 = affine.vector_load %b[%j, %i*4] : memref<10x20xf32>, vector<4xf32>
170      affine.vector_store %ld0, %c[%j, %i*4] : memref<10x20xf32>, vector<4xf32>
171    }
172  }
173
174  return
175}
176// CHECK:      affine.for
177// CHECK-NEXT:   affine.for
178// CHECK-NEXT:     affine.vector_load
179// CHECK-NEXT:     affine.vector_store
180// CHECK-NEXT:     affine.vector_load
181// CHECK-NEXT:     affine.vector_store
182// CHECK-NOT:  affine.for
183
184// -----
185
186// CHECK-LABEL: func @multi_outgoing_edges
// Input: four consecutive loops over [0, 32). Each loads %in0[%d] and
// %in1[%d], combines them (addf, subf, mulf, divf in that order) and stores
// the result back to %in0[%d]. The SSA name %add is reused in every loop
// regardless of the arithmetic op applied.
187func.func @multi_outgoing_edges(%in0 : memref<32xf32>,
188                      %in1 : memref<32xf32>) {
189  affine.for %d = 0 to 32 {
190    %lhs = affine.load %in0[%d] : memref<32xf32>
191    %rhs = affine.load %in1[%d] : memref<32xf32>
192    %add = arith.addf %lhs, %rhs : f32
193    affine.store %add, %in0[%d] : memref<32xf32>
194  }
195  affine.for %d = 0 to 32 {
196    %lhs = affine.load %in0[%d] : memref<32xf32>
197    %rhs = affine.load %in1[%d] : memref<32xf32>
198    %add = arith.subf %lhs, %rhs : f32
199    affine.store %add, %in0[%d] : memref<32xf32>
200  }
201  affine.for %d = 0 to 32 {
202    %lhs = affine.load %in0[%d] : memref<32xf32>
203    %rhs = affine.load %in1[%d] : memref<32xf32>
204    %add = arith.mulf %lhs, %rhs : f32
205    affine.store %add, %in0[%d] : memref<32xf32>
206  }
207  affine.for %d = 0 to 32 {
208    %lhs = affine.load %in0[%d] : memref<32xf32>
209    %rhs = affine.load %in1[%d] : memref<32xf32>
210    %add = arith.divf %lhs, %rhs : f32
211    affine.store %add, %in0[%d] : memref<32xf32>
212  }
213  return
214}
215
216// CHECK:      affine.for
217// CHECK-NOT:  affine.for
218// CHECK:        arith.addf
219// CHECK-NOT:  affine.for
220// CHECK:        arith.subf
221// CHECK-NOT:  affine.for
222// CHECK:        arith.mulf
223// CHECK-NOT:  affine.for
224// CHECK:        arith.divf
225
226// -----
227
228// Test fusion when dynamically shaped memrefs are used with constant trip count loops.
229
230// CHECK-LABEL: func @calc
// Input: the producer loop stores %arg0 + %arg1 into the dynamically sized
// local buffer %1 (allocated with size %len); the consumer loop multiplies %1
// by %arg1 and stores to %arg2. Both loops iterate over the constant range
// [1, 10) even though every memref has a dynamic shape.
// NOTE: %c1 is defined but not referenced by any op in this function.
231func.func @calc(%arg0: memref<?xf32>, %arg1: memref<?xf32>, %arg2: memref<?xf32>, %len: index) {
232  %c1 = arith.constant 1 : index
233  %1 = memref.alloc(%len) : memref<?xf32>
234  affine.for %arg4 = 1 to 10 {
235    %7 = affine.load %arg0[%arg4] : memref<?xf32>
236    %8 = affine.load %arg1[%arg4] : memref<?xf32>
237    %9 = arith.addf %7, %8 : f32
238    affine.store %9, %1[%arg4] : memref<?xf32>
239  }
240  affine.for %arg4 = 1 to 10 {
241    %7 = affine.load %1[%arg4] : memref<?xf32>
242    %8 = affine.load %arg1[%arg4] : memref<?xf32>
243    %9 = arith.mulf %7, %8 : f32
244    affine.store %9, %arg2[%arg4] : memref<?xf32>
245  }
246  return
247}
248// CHECK:       memref.alloc() : memref<1xf32>
249// CHECK:       affine.for %arg{{.*}} = 1 to 10 {
250// CHECK-NEXT:    affine.load %arg{{.*}}
251// CHECK-NEXT:    affine.load %arg{{.*}}
252// CHECK-NEXT:    arith.addf
253// CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
254// CHECK-NEXT:    affine.load %{{.*}}[0] : memref<1xf32>
255// CHECK-NEXT:    affine.load %arg{{.*}}[%arg{{.*}}] : memref<?xf32>
256// CHECK-NEXT:    arith.mulf
257// CHECK-NEXT:    affine.store %{{.*}}, %arg{{.*}}[%arg{{.*}}] : memref<?xf32>
258// CHECK-NEXT:  }
259// CHECK-NEXT:  return
260
261// -----
262
263// CHECK-LABEL: func @should_not_fuse_since_non_affine_users
// Input: three read-modify-write loops on %in0 over [0, 32). The first and
// third use affine.load / affine.store; the middle one accesses the same
// memrefs with memref.load / memref.store instead.
264func.func @should_not_fuse_since_non_affine_users(%in0 : memref<32xf32>,
265                      %in1 : memref<32xf32>) {
266  affine.for %d = 0 to 32 {
267    %lhs = affine.load %in0[%d] : memref<32xf32>
268    %rhs = affine.load %in1[%d] : memref<32xf32>
269    %add = arith.addf %lhs, %rhs : f32
270    affine.store %add, %in0[%d] : memref<32xf32>
271  }
272  affine.for %d = 0 to 32 {
273    %lhs = memref.load %in0[%d] : memref<32xf32>
274    %rhs = memref.load %in1[%d] : memref<32xf32>
275    %add = arith.subf %lhs, %rhs : f32
276    memref.store %add, %in0[%d] : memref<32xf32>
277  }
278  affine.for %d = 0 to 32 {
279    %lhs = affine.load %in0[%d] : memref<32xf32>
280    %rhs = affine.load %in1[%d] : memref<32xf32>
281    %add = arith.mulf %lhs, %rhs : f32
282    affine.store %add, %in0[%d] : memref<32xf32>
283  }
284  return
285}
286
287// CHECK:  affine.for
288// CHECK:    arith.addf
289// CHECK:  affine.for
290// CHECK:    arith.subf
291// CHECK:  affine.for
292// CHECK:    arith.mulf
293
294// -----
295
296// CHECK-LABEL: func @should_not_fuse_since_top_level_non_affine_users
// Input: the first loop writes the rank-0 buffer %sum with memref.store in
// addition to updating %in0. Between the loops, a top-level memref.load reads
// %sum; the loaded value is consumed inside the second loop.
297func.func @should_not_fuse_since_top_level_non_affine_users(%in0 : memref<32xf32>,
298                      %in1 : memref<32xf32>) {
299  %sum = memref.alloc() : memref<f32>
300  affine.for %d = 0 to 32 {
301    %lhs = affine.load %in0[%d] : memref<32xf32>
302    %rhs = affine.load %in1[%d] : memref<32xf32>
303    %add = arith.addf %lhs, %rhs : f32
304    memref.store %add, %sum[] : memref<f32>
305    affine.store %add, %in0[%d] : memref<32xf32>
306  }
307  %load_sum = memref.load %sum[] : memref<f32>
308  affine.for %d = 0 to 32 {
309    %lhs = affine.load %in0[%d] : memref<32xf32>
310    %rhs = affine.load %in1[%d] : memref<32xf32>
311    %add = arith.mulf %lhs, %rhs : f32
312    %sub = arith.subf %add, %load_sum: f32
313    affine.store %sub, %in0[%d] : memref<32xf32>
314  }
315  memref.dealloc %sum : memref<f32>
316  return
317}
318
319// CHECK:  affine.for
320// CHECK:    arith.addf
321// CHECK:  affine.for
322// CHECK:    arith.mulf
323// CHECK:    arith.subf
324
325// -----
326
327// CHECK-LABEL: func @should_not_fuse_since_top_level_non_affine_mem_write_users
// Input: two identical read-modify-write loops on %in0, separated by a
// top-level memref.store that writes %cst_0 to %in0[%c0].
328func.func @should_not_fuse_since_top_level_non_affine_mem_write_users(
329    %in0 : memref<32xf32>, %in1 : memref<32xf32>) {
330  %c0 = arith.constant 0 : index
331  %cst_0 = arith.constant 0.000000e+00 : f32
332
333  affine.for %d = 0 to 32 {
334    %lhs = affine.load %in0[%d] : memref<32xf32>
335    %rhs = affine.load %in1[%d] : memref<32xf32>
336    %add = arith.addf %lhs, %rhs : f32
337    affine.store %add, %in0[%d] : memref<32xf32>
338  }
339  memref.store %cst_0, %in0[%c0] : memref<32xf32>
340  affine.for %d = 0 to 32 {
341    %lhs = affine.load %in0[%d] : memref<32xf32>
342    %rhs = affine.load %in1[%d] : memref<32xf32>
343    %add = arith.addf %lhs, %rhs: f32
344    affine.store %add, %in0[%d] : memref<32xf32>
345  }
346  return
347}
348
349// CHECK:  affine.for
350// CHECK:    arith.addf
351// CHECK:  affine.for
352// CHECK:    arith.addf
353
354// -----
355
356// MAXIMAL-LABEL: func @fuse_minor_affine_map
// Input: a 128-element copy from %in to the function-local buffer %tmp,
// followed by a 20x512 nest that reads %tmp at index (%arg4 mod 128) and
// writes %out[%arg3, %arg4].
357func.func @fuse_minor_affine_map(%in: memref<128xf32>, %out: memref<20x512xf32>) {
358  %tmp = memref.alloc() : memref<128xf32>
359
360  affine.for %arg4 = 0 to 128 {
361    %ld = affine.load %in[%arg4] : memref<128xf32>
362    affine.store %ld, %tmp[%arg4] : memref<128xf32>
363  }
364
365  affine.for %arg3 = 0 to 20 {
366    affine.for %arg4 = 0 to 512 {
367      %ld = affine.load %tmp[%arg4 mod 128] : memref<128xf32>
368      affine.store %ld, %out[%arg3, %arg4] : memref<20x512xf32>
369    }
370  }
371
372  return
373}
374
375// TODO: The size of the private memref is not properly computed in the presence
376// of the 'mod' operation. It should be memref<1xf32> instead of
377// memref<128xf32>: https://bugs.llvm.org/show_bug.cgi?id=46973
378// MAXIMAL:       memref.alloc() : memref<128xf32>
379// MAXIMAL:       affine.for
380// MAXIMAL-NEXT:    affine.for
381// MAXIMAL-NOT:   affine.for
382// MAXIMAL:       return
383
384// -----
385
386// CHECK-LABEL: func @should_fuse_multi_store_producer_and_privatize_memfefs
// Input: the producer loop stores to three function-local buffers %a, %b and
// %c and also loads %c; two later loops load %a and %b respectively. No other
// loop touches %c.
387func.func @should_fuse_multi_store_producer_and_privatize_memfefs() {
388  %a = memref.alloc() : memref<10xf32>
389  %b = memref.alloc() : memref<10xf32>
390  %c = memref.alloc() : memref<10xf32>
391  %cst = arith.constant 0.000000e+00 : f32
392  affine.for %arg0 = 0 to 10 {
393    affine.store %cst, %a[%arg0] : memref<10xf32>
394    affine.store %cst, %b[%arg0] : memref<10xf32>
395    affine.store %cst, %c[%arg0] : memref<10xf32>
396    %0 = affine.load %c[%arg0] : memref<10xf32>
397  }
398
399  affine.for %arg0 = 0 to 10 {
400    %0 = affine.load %a[%arg0] : memref<10xf32>
401  }
402
403  affine.for %arg0 = 0 to 10 {
404    %0 = affine.load %b[%arg0] : memref<10xf32>
405  }
406
407  // All the memrefs should be privatized except '%c', which is not involved in
408  // the producer-consumer fusion.
409  // CHECK:      affine.for %{{.*}} = 0 to 10 {
410  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
411  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
412  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
413  // CHECK-NEXT:   affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
414  // CHECK-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
415  // CHECK-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
416  // CHECK-NEXT: }
417
418  return
419}
420
421
// CHECK-LABEL: func @should_fuse_multi_store_producer_with_escaping_memrefs_and_remove_src
// Input: the producer loop stores to the function arguments %a and %b over
// [0, 10); two consumer loops over the same range load %a and %b
// respectively. The label directive above anchors the expectations in the
// body to this function's output.
422func.func @should_fuse_multi_store_producer_with_escaping_memrefs_and_remove_src(
423    %a : memref<10xf32>, %b : memref<10xf32>) {
424  %cst = arith.constant 0.000000e+00 : f32
425  affine.for %i0 = 0 to 10 {
426    affine.store %cst, %a[%i0] : memref<10xf32>
427    affine.store %cst, %b[%i0] : memref<10xf32>
428  }
429
430  affine.for %i1 = 0 to 10 {
431    %0 = affine.load %a[%i1] : memref<10xf32>
432  }
433
434  affine.for %i2 = 0 to 10 {
435    %0 = affine.load %b[%i2] : memref<10xf32>
436  }
437
438  // Producer loop '%i0' should be removed after fusion since fusion is maximal.
439  // No memref should be privatized since they escape the function, and the
440  // producer is removed after fusion.
441  // CHECK:       affine.for %{{.*}} = 0 to 10 {
442  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
443  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
444  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
445  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
446  // CHECK-NEXT:  }
447  // CHECK-NOT:   affine.for
448
449  return
450}
451
452// -----
453
// CHECK-LABEL: func @should_fuse_multi_store_producer_with_escaping_memrefs_and_preserve_src
// Input: the producer loop stores to the function arguments %a and %b over
// [0, 10); the consumer of %a only covers [0, 5) while the consumer of %b
// covers [0, 10). The label directive above anchors the expectations in the
// body to this function's output.
454func.func @should_fuse_multi_store_producer_with_escaping_memrefs_and_preserve_src(
455    %a : memref<10xf32>, %b : memref<10xf32>) {
456  %cst = arith.constant 0.000000e+00 : f32
457  affine.for %i0 = 0 to 10 {
458    affine.store %cst, %a[%i0] : memref<10xf32>
459    affine.store %cst, %b[%i0] : memref<10xf32>
460  }
461
462  affine.for %i1 = 0 to 5 {
463    %0 = affine.load %a[%i1] : memref<10xf32>
464  }
465
466  affine.for %i2 = 0 to 10 {
467    %0 = affine.load %b[%i2] : memref<10xf32>
468  }
469
470  // Loops '%i0' and '%i2' should be fused first and '%i0' should be removed
471  // since fusion is maximal. Then the fused loop and '%i1' should be fused
472  // and the fused loop shouldn't be removed since fusion is not maximal.
473  // CHECK:       affine.for %{{.*}} = 0 to 10 {
474  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
475  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
476  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
477  // CHECK-NEXT:  }
478  // CHECK:       affine.for %{{.*}} = 0 to 5 {
479  // CHECK-NEXT:    affine.store %{{.*}} : memref<1xf32>
480  // CHECK-NEXT:    affine.store %{{.*}} : memref<10xf32>
481  // CHECK-NEXT:    affine.load %{{.*}} : memref<10xf32>
482  // CHECK-NEXT:    affine.load %{{.*}} : memref<1xf32>
483  // CHECK-NEXT:  }
484  // CHECK-NOT:   affine.for
485
486  return
487}
488
489
// Input: the first loop copies %arg0 into the local buffers %A and %C. %C is
// deallocated, and %B allocated, between the two loops. The second loop reads
// %A, adds %cst_1 and stores to %B; %A is deallocated after it.
490func.func @should_not_fuse_due_to_dealloc(%arg0: memref<16xf32>){
491  %A = memref.alloc() : memref<16xf32>
492  %C = memref.alloc() : memref<16xf32>
493  %cst_1 = arith.constant 1.000000e+00 : f32
494  affine.for %arg1 = 0 to 16 {
495    %a = affine.load %arg0[%arg1] : memref<16xf32>
496    affine.store %a, %A[%arg1] : memref<16xf32>
497    affine.store %a, %C[%arg1] : memref<16xf32>
498  }
499  memref.dealloc %C : memref<16xf32>
500  %B = memref.alloc() : memref<16xf32>
501  affine.for %arg1 = 0 to 16 {
502    %a = affine.load %A[%arg1] : memref<16xf32>
503    %b = arith.addf %cst_1, %a : f32
504    affine.store %b, %B[%arg1] : memref<16xf32>
505  }
506  memref.dealloc %A : memref<16xf32>
507  return
508}
509// CHECK-LABEL: func @should_not_fuse_due_to_dealloc
510// CHECK:         affine.for
511// CHECK-NEXT:      affine.load
512// CHECK-NEXT:      affine.store
513// CHECK-NEXT:      affine.store
514// CHECK:         memref.dealloc
515// CHECK:         affine.for
516// CHECK-NEXT:      affine.load
517// CHECK-NEXT:      arith.addf
518// CHECK-NEXT:      affine.store
519
520// -----
521
522// CHECK-LABEL: func @should_fuse_defining_node_has_no_dependence_from_source_node
// Input: loop '%i0' reads the rank-0 memref %b and stores the value to %a.
// A top-level load of %b sits between the loops and defines %0, which loop
// '%i1' uses together with its loads of %a. Loop '%i0' only reads %b; it never
// writes it.
523func.func @should_fuse_defining_node_has_no_dependence_from_source_node(
524    %a : memref<10xf32>, %b : memref<f32>) -> () {
525  affine.for %i0 = 0 to 10 {
526    %0 = affine.load %b[] : memref<f32>
527    affine.store %0, %a[%i0] : memref<10xf32>
528  }
529  %0 = affine.load %b[] : memref<f32>
530  affine.for %i1 = 0 to 10 {
531    %1 = affine.load %a[%i1] : memref<10xf32>
532    %2 = arith.divf %0, %1 : f32
533  }
534
535  // Loops '%i0' and '%i1' should be fused even though there is a defining node
536  // between the loops. It is because the node has no dependence from '%i0'.
537  // CHECK:       affine.load %{{.*}}[] : memref<f32>
538  // CHECK-NEXT:  affine.for %{{.*}} = 0 to 10 {
539  // CHECK-NEXT:    affine.load %{{.*}}[] : memref<f32>
540  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
541  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
542  // CHECK-NEXT:    arith.divf
543  // CHECK-NEXT:  }
544  // CHECK-NOT:   affine.for
545  return
546}
547
548// -----
549
550// CHECK-LABEL: func @should_not_fuse_defining_node_has_dependence_from_source_loop
// Input: loop '%i0' stores to both the rank-0 memref %b and to %a. The
// top-level load of %b between the loops defines %0, which loop '%i1' uses
// together with its loads of %a.
551func.func @should_not_fuse_defining_node_has_dependence_from_source_loop(
552    %a : memref<10xf32>, %b : memref<f32>) -> () {
553  %cst = arith.constant 0.000000e+00 : f32
554  affine.for %i0 = 0 to 10 {
555    affine.store %cst, %b[] : memref<f32>
556    affine.store %cst, %a[%i0] : memref<10xf32>
557  }
558  %0 = affine.load %b[] : memref<f32>
559  affine.for %i1 = 0 to 10 {
560    %1 = affine.load %a[%i1] : memref<10xf32>
561    %2 = arith.divf %0, %1 : f32
562  }
563
564  // Loops '%i0' and '%i1' should not be fused because the defining node of '%0'
565  // used in '%i1' has dependence from loop '%i0'.
566  // CHECK:       affine.for %{{.*}} = 0 to 10 {
567  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[] : memref<f32>
568  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
569  // CHECK-NEXT:  }
570  // CHECK-NEXT:  affine.load %{{.*}}[] : memref<f32>
571  // CHECK:       affine.for %{{.*}} = 0 to 10 {
572  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
573  // CHECK-NEXT:    arith.divf
574  // CHECK-NEXT:  }
575  return
576}
577
578// -----
579
580// CHECK-LABEL: func @should_not_fuse_defining_node_has_transitive_dependence_from_source_loop
// Input: loop '%i0' stores to %a and %b; loop '%i1' loads %b and stores the
// value to the rank-0 memref %c; a top-level load of %c defines %0, which
// loop '%i2' uses together with its loads of %a.
581func.func @should_not_fuse_defining_node_has_transitive_dependence_from_source_loop(
582    %a : memref<10xf32>, %b : memref<10xf32>, %c : memref<f32>) -> () {
583  %cst = arith.constant 0.000000e+00 : f32
584  affine.for %i0 = 0 to 10 {
585    affine.store %cst, %a[%i0] : memref<10xf32>
586    affine.store %cst, %b[%i0] : memref<10xf32>
587  }
588  affine.for %i1 = 0 to 10 {
589    %1 = affine.load %b[%i1] : memref<10xf32>
590    affine.store %1, %c[] : memref<f32>
591  }
592  %0 = affine.load %c[] : memref<f32>
593  affine.for %i2 = 0 to 10 {
594    %1 = affine.load %a[%i2] : memref<10xf32>
595    %2 = arith.divf %0, %1 : f32
596  }
597
598  // When loops '%i0' and '%i2' are evaluated first, they should not be
599  // fused. The defining node of '%0' in loop '%i2' has transitive dependence
600  // from loop '%i0'. After that, loops '%i0' and '%i1' are evaluated, and they
601  // will be fused as usual.
602  // CHECK:       affine.for %{{.*}} = 0 to 10 {
603  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
604  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
605  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
606  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[] : memref<f32>
607  // CHECK-NEXT:  }
608  // CHECK-NEXT:  affine.load %{{.*}}[] : memref<f32>
609  // CHECK:       affine.for %{{.*}} = 0 to 10 {
610  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
611  // CHECK-NEXT:    arith.divf
612  // CHECK-NEXT:  }
613  // CHECK-NOT:   affine.for
614  return
615}
616
617// -----
618
619// CHECK-LABEL: func @should_not_fuse_dest_loop_nest_return_value
// Input: a plain loop storing %cst to %a, followed by a step-2 loop that
// carries an iter_arg, loads %a and yields the loaded value as its result %b.
// The directives in the body expect both loops to remain separate.
620func.func @should_not_fuse_dest_loop_nest_return_value(
621    %a : memref<10xf32>) -> () {
622  %cst = arith.constant 0.000000e+00 : f32
623  affine.for %i0 = 0 to 10 {
624    affine.store %cst, %a[%i0] : memref<10xf32>
625  }
626  %b = affine.for %i1 = 0 to 10 step 2 iter_args(%b_iter = %cst) -> f32 {
627    %load_a = affine.load %a[%i1] : memref<10xf32>
628    affine.yield %load_a: f32
629  }
630
631  // CHECK:       affine.for %{{.*}} = 0 to 10 {
632  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
633  // CHECK-NEXT:  }
634  // CHECK:       affine.for %{{.*}} = 0 to 10 step 2 iter_args(%{{.*}} = %{{.*}}) -> (f32) {
635  // CHECK-NEXT:    affine.load
636  // CHECK-NEXT:    affine.yield
637  // CHECK-NEXT:  }
638
639  return
640}
641
642// -----
643
644// CHECK-LABEL: func @should_not_fuse_src_loop_nest_return_value
// Input: a step-2 loop that carries an iter_arg, doubles it each iteration,
// stores the value to %a and yields it as the loop result %b; it is followed
// by a plain loop loading %a. The directives in the body expect both loops to
// remain separate.
645func.func @should_not_fuse_src_loop_nest_return_value(
646    %a : memref<10xf32>) -> () {
647  %cst = arith.constant 1.000000e+00 : f32
648  %b = affine.for %i = 0 to 10 step 2 iter_args(%b_iter = %cst) -> f32 {
649    %c = arith.addf %b_iter, %b_iter : f32
650    affine.store %c, %a[%i] : memref<10xf32>
651    affine.yield %c: f32
652  }
653  affine.for %i1 = 0 to 10 {
654    %1 = affine.load %a[%i1] : memref<10xf32>
655  }
656
657  // CHECK:       %{{.*}} = affine.for %{{.*}} = 0 to 10 step 2 iter_args(%{{.*}} = %{{.*}}) -> (f32) {
658  // CHECK-NEXT:    %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
659  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
660  // CHECK-NEXT:    affine.yield %{{.*}} : f32
661  // CHECK-NEXT:  }
662  // CHECK:       affine.for %{{.*}} = 0 to 10 {
663  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
664  // CHECK-NEXT:  }
665
666  return
667}
668
669// -----
670
// Declares an external function that takes a memref operand, then the test
// function. Input: the producer loop copies %arg0 into the local buffer %A;
// %A is passed to a call to @some_function between the loops; the consumer
// loop reads %A, adds %cst_1 and stores to %B.
671func.func private @some_function(memref<16xf32>)
672func.func @call_op_prevents_fusion(%arg0: memref<16xf32>){
673  %A = memref.alloc() : memref<16xf32>
674  %cst_1 = arith.constant 1.000000e+00 : f32
675  affine.for %arg1 = 0 to 16 {
676    %a = affine.load %arg0[%arg1] : memref<16xf32>
677    affine.store %a, %A[%arg1] : memref<16xf32>
678  }
679  call @some_function(%A) : (memref<16xf32>) -> ()
680  %B = memref.alloc() : memref<16xf32>
681  affine.for %arg1 = 0 to 16 {
682    %a = affine.load %A[%arg1] : memref<16xf32>
683    %b = arith.addf %cst_1, %a : f32
684    affine.store %b, %B[%arg1] : memref<16xf32>
685  }
686  return
687}
688// CHECK-LABEL: func @call_op_prevents_fusion
689// CHECK:         affine.for
690// CHECK-NEXT:      affine.load
691// CHECK-NEXT:      affine.store
692// CHECK:         call
693// CHECK:         affine.for
694// CHECK-NEXT:      affine.load
695// CHECK-NEXT:      arith.addf
696// CHECK-NEXT:      affine.store
697
698// -----
699
// Declares an external function with no operands, then the test function.
// Input: same producer/consumer loops on the local buffer %A as in
// @call_op_prevents_fusion, but the call between them takes no memref
// operand.
700func.func private @some_function()
701func.func @call_op_does_not_prevent_fusion(%arg0: memref<16xf32>){
702  %A = memref.alloc() : memref<16xf32>
703  %cst_1 = arith.constant 1.000000e+00 : f32
704  affine.for %arg1 = 0 to 16 {
705    %a = affine.load %arg0[%arg1] : memref<16xf32>
706    affine.store %a, %A[%arg1] : memref<16xf32>
707  }
708  call @some_function() : () -> ()
709  %B = memref.alloc() : memref<16xf32>
710  affine.for %arg1 = 0 to 16 {
711    %a = affine.load %A[%arg1] : memref<16xf32>
712    %b = arith.addf %cst_1, %a : f32
713    affine.store %b, %B[%arg1] : memref<16xf32>
714  }
715  return
716}
717// CHECK-LABEL: func @call_op_does_not_prevent_fusion
718// CHECK:         affine.for
719// CHECK-NOT:     affine.for
720
721// -----
722
723// Test for source that writes to an escaping memref and has two consumers.
724// Fusion should create private memrefs in place of `%arg0` since the source is
725// not to be removed after fusion and the destinations do not write to `%arg0`.
726// This should enable both the consumers to benefit from fusion, which would not
727// be possible if private memrefs were not created.
// Input: one loop storing %cf7 to %arg0 over [0, 10), followed by two loops
// that load %arg0 over [0, 7) and [5, 9) respectively.
728func.func @should_fuse_with_both_consumers_separately(%arg0: memref<10xf32>) {
729  %cf7 = arith.constant 7.0 : f32
730  affine.for %i0 = 0 to 10 {
731    affine.store %cf7, %arg0[%i0] : memref<10xf32>
732  }
733  affine.for %i1 = 0 to 7 {
734    %v0 = affine.load %arg0[%i1] : memref<10xf32>
735  }
736  affine.for %i1 = 5 to 9 {
737    %v0 = affine.load %arg0[%i1] : memref<10xf32>
738  }
739  return
740}
741
742// CHECK-LABEL: func @should_fuse_with_both_consumers_separately
743// CHECK:         affine.for
744// CHECK-NEXT:      affine.store
745// CHECK:         affine.for
746// CHECK-NEXT:      affine.store
747// CHECK-NEXT:      affine.load
748// CHECK:         affine.for
749// CHECK-NEXT:      affine.store
750// CHECK-NEXT:      affine.load
751
752// -----
753
754// Fusion is avoided when the slice computed is invalid. Comments below describe
755// incorrect backward slice computation. Similar logic applies for forward slice
756// as well.
// Input: the producer loop reads %A[%arg0] and writes %B[%arg0 + 1] over
// [0, 5); the consumer loop reads %B[%arg0] over the same range, multiplies
// by %cst and writes %C. The producer's store index is shifted by one
// relative to the consumer's load index.
757func.func @no_fusion_cannot_compute_valid_slice() {
758  %A = memref.alloc() : memref<5xf32>
759  %B = memref.alloc() : memref<6xf32>
760  %C = memref.alloc() : memref<5xf32>
761  %cst = arith.constant 0. : f32
762
763  affine.for %arg0 = 0 to 5 {
764    %a = affine.load %A[%arg0] : memref<5xf32>
765    affine.store %a, %B[%arg0 + 1] : memref<6xf32>
766  }
767
768  affine.for %arg0 = 0 to 5 {
769    // Backward slice computed will be:
770    // slice ( src loop: 0, dst loop: 1, depth: 1 : insert point: (1, 0)
771    // loop bounds: [(d0) -> (d0 - 1), (d0) -> (d0)] )
772
773    // Resulting fusion would be as below. It is easy to note the out-of-bounds
774    // access by 'affine.load'.
775
776    // #map0 = affine_map<(d0) -> (d0 - 1)>
777    // #map1 = affine_map<(d0) -> (d0)>
778    // affine.for %arg1 = #map0(%arg0) to #map1(%arg0) {
779    //   %5 = affine.load %1[%arg1] : memref<5xf32>
780    //   ...
781    //   ...
782    // }
783
784    %a = affine.load %B[%arg0] : memref<6xf32>
785    %b = arith.mulf %a, %cst : f32
786    affine.store %b, %C[%arg0] : memref<5xf32>
787  }
788  return
789}
790// CHECK-LABEL: func @no_fusion_cannot_compute_valid_slice
791// CHECK:         affine.for
792// CHECK-NEXT:      affine.load
793// CHECK-NEXT:      affine.store
794// CHECK:         affine.for
795// CHECK-NEXT:      affine.load
796// CHECK-NEXT:      arith.mulf
797// CHECK-NEXT:      affine.store
798
799// MAXIMAL-LABEL:   func @reduce_add_f32_f32(
800func.func @reduce_add_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1: memref<1x64xf32, 1>, %arg2: memref<1x64xf32, 1>) {
801  %cst_0 = arith.constant 0.000000e+00 : f32
802  %cst_1 = arith.constant 1.000000e+00 : f32
803  %0 = memref.alloca() : memref<f32, 1>
804  %1 = memref.alloca() : memref<f32, 1>
805  affine.for %arg3 = 0 to 1 {
806    affine.for %arg4 = 0 to 64 {
807      %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 {
808        %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
809        %5 = arith.addf %prevAccum, %4 : f32
810        affine.yield %5 : f32
811      }
812      %accum_dbl = arith.addf %accum, %accum : f32
813      affine.store %accum_dbl, %arg1[%arg3, %arg4] : memref<1x64xf32, 1>
814    }
815  }
816  affine.for %arg3 = 0 to 1 {
817    affine.for %arg4 = 0 to 64 {
818      %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_1) -> f32 {
819        %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
820        %5 = arith.mulf %prevAccum, %4 : f32
821        affine.yield %5 : f32
822      }
823      %accum_sqr = arith.mulf %accum, %accum : f32
824      affine.store %accum_sqr, %arg2[%arg3, %arg4] : memref<1x64xf32, 1>
825    }
826  }
827  return
828}
829// The two loops here get maximally sibling-fused at the innermost
830// insertion point. Test checks  if the innermost reduction loop of the fused loop
831// gets promoted into its outerloop.
832// MAXIMAL-SAME:                             %[[arg_0:.*]]: memref<64x64xf32, 1>,
833// MAXIMAL-SAME:                             %[[arg_1:.*]]: memref<1x64xf32, 1>,
834// MAXIMAL-SAME:                             %[[arg_2:.*]]: memref<1x64xf32, 1>) {
835// MAXIMAL:             %[[cst:.*]] = arith.constant 0 : index
836// MAXIMAL-NEXT:        %[[cst_0:.*]] = arith.constant 0.000000e+00 : f32
837// MAXIMAL-NEXT:        %[[cst_1:.*]] = arith.constant 1.000000e+00 : f32
838// MAXIMAL:             affine.for %[[idx_0:.*]] = 0 to 1 {
839// MAXIMAL-NEXT:          affine.for %[[idx_1:.*]] = 0 to 64 {
840// MAXIMAL-NEXT:            %[[results:.*]]:2 = affine.for %[[idx_2:.*]] = 0 to 64 iter_args(%[[iter_0:.*]] = %[[cst_1]], %[[iter_1:.*]] = %[[cst_0]]) -> (f32, f32) {
841// MAXIMAL-NEXT:              %[[val_0:.*]] = affine.load %[[arg_0]][%[[idx_2]], %[[idx_1]]] : memref<64x64xf32, 1>
842// MAXIMAL-NEXT:              %[[reduc_0:.*]] = arith.addf %[[iter_1]], %[[val_0]] : f32
843// MAXIMAL-NEXT:              %[[val_1:.*]] = affine.load %[[arg_0]][%[[idx_2]], %[[idx_1]]] : memref<64x64xf32, 1>
844// MAXIMAL-NEXT:              %[[reduc_1:.*]] = arith.mulf %[[iter_0]], %[[val_1]] : f32
845// MAXIMAL-NEXT:              affine.yield %[[reduc_1]], %[[reduc_0]] : f32, f32
846// MAXIMAL-NEXT:            }
847// MAXIMAL-NEXT:            %[[reduc_0_dbl:.*]] = arith.addf %[[results]]#1, %[[results]]#1 : f32
848// MAXIMAL-NEXT:            affine.store %[[reduc_0_dbl]], %[[arg_1]][%[[cst]], %[[idx_1]]] : memref<1x64xf32, 1>
849// MAXIMAL-NEXT:            %[[reduc_1_sqr:.*]] = arith.mulf %[[results]]#0, %[[results]]#0 : f32
850// MAXIMAL-NEXT:            affine.store %[[reduc_1_sqr]], %[[arg_2]][%[[idx_0]], %[[idx_1]]] : memref<1x64xf32, 1>
851// MAXIMAL-NEXT:          }
852// MAXIMAL-NEXT:        }
853// MAXIMAL-NEXT:        return
854// MAXIMAL-NEXT:      }
855
856// -----
857
// Two sibling loop nests that both read %arg0. Each nest has an innermost
// iter_args reduction (a sum in the first nest, a product in the second) placed
// under two enclosing loops, so the reduction is not the outermost loop of its
// nest.
858// CHECK-LABEL:   func @reduce_add_non_innermost
859func.func @reduce_add_non_innermost(%arg0: memref<64x64xf32, 1>, %arg1: memref<1x64xf32, 1>, %arg2: memref<1x64xf32, 1>) {
860  %cst = arith.constant 0.000000e+00 : f32
861  %cst_0 = arith.constant 1.000000e+00 : f32
  // NOTE(review): these two allocas are never referenced below.
862  %0 = memref.alloca() : memref<f32, 1>
863  %1 = memref.alloca() : memref<f32, 1>
  // First nest: for each %arg4, sums %arg0[%arg5, %arg4] over %arg5 starting
  // from %cst (0.0), then stores twice that sum to %arg1[%arg3, %arg4].
864  affine.for %arg3 = 0 to 1 {
865    affine.for %arg4 = 0 to 64 {
866      %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst) -> f32 {
867        %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
868        %5 = arith.addf %prevAccum, %4 : f32
869        affine.yield %5 : f32
870      }
871      %accum_dbl = arith.addf %accum, %accum : f32
872      affine.store %accum_dbl, %arg1[%arg3, %arg4] : memref<1x64xf32, 1>
873    }
874  }
  // Second nest: same loads from %arg0, but multiplies them together starting
  // from %cst_0 (1.0), then stores the square of that product to
  // %arg2[%arg3, %arg4].
875  affine.for %arg3 = 0 to 1 {
876    affine.for %arg4 = 0 to 64 {
877      %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 {
878        %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
879        %5 = arith.mulf %prevAccum, %4 : f32
880        affine.yield %5 : f32
881      }
882      %accum_sqr = arith.mulf %accum, %accum : f32
883      affine.store %accum_sqr, %arg2[%arg3, %arg4] : memref<1x64xf32, 1>
884    }
885  }
886  return
887}
888// Test checks the loop structure is preserved after sibling fusion: three
// `affine.for` ops on consecutive output lines, followed later by one more
// `affine.for`.
889// CHECK:         affine.for
890// CHECK-NEXT:      affine.for
891// CHECK-NEXT:        affine.for
892// CHECK:            affine.for
893
894
895
896// -----
897
// Stress test with 20 two-level loop nests, each iterating over a 20x10 space
// and connected to the others through temporary memrefs and function
// arguments. The directives after the function require exactly two
// `affine.for` ops in the output for this function.
898// CHECK-LABEL: func @fuse_large_number_of_loops
899func.func @fuse_large_number_of_loops(%arg0: memref<20x10xf32, 1>, %arg1: memref<20x10xf32, 1>, %arg2: memref<20x10xf32, 1>, %arg3: memref<20x10xf32, 1>, %arg4: memref<20x10xf32, 1>, %arg5: memref<f32, 1>, %arg6: memref<f32, 1>, %arg7: memref<f32, 1>, %arg8: memref<f32, 1>, %arg9: memref<20x10xf32, 1>, %arg10: memref<20x10xf32, 1>, %arg11: memref<20x10xf32, 1>, %arg12: memref<20x10xf32, 1>) {
900  %cst = arith.constant 1.000000e+00 : f32
  // Scalar temporary initialized to 1.0; read again in the scalar computation
  // further down.
901  %0 = memref.alloc() : memref<f32, 1>
902  affine.store %cst, %0[] : memref<f32, 1>
  // Fill %1 with the scalar loaded from %arg6.
903  %1 = memref.alloc() : memref<20x10xf32, 1>
904  affine.for %arg13 = 0 to 20 {
905    affine.for %arg14 = 0 to 10 {
906      %21 = affine.load %arg6[] : memref<f32, 1>
907      affine.store %21, %1[%arg13, %arg14] : memref<20x10xf32, 1>
908    }
909  }
910  %2 = memref.alloc() : memref<20x10xf32, 1>
911  affine.for %arg13 = 0 to 20 {
912    affine.for %arg14 = 0 to 10 {
913      %21 = affine.load %1[%arg13, %arg14] : memref<20x10xf32, 1>
914      %22 = affine.load %arg3[%arg13, %arg14] : memref<20x10xf32, 1>
915      %23 = arith.mulf %22, %21 : f32
916      affine.store %23, %2[%arg13, %arg14] : memref<20x10xf32, 1>
917    }
918  }
  // Scalar (non-loop) computation sitting between loop nests:
  // %3 = %0 - %arg6, where %0 was set to 1.0 above.
919  %3 = memref.alloc() : memref<f32, 1>
920  %4 = affine.load %arg6[] : memref<f32, 1>
921  %5 = affine.load %0[] : memref<f32, 1>
922  %6 = arith.subf %5, %4 : f32
923  affine.store %6, %3[] : memref<f32, 1>
  // Fill %7 with the scalar stored in %3.
924  %7 = memref.alloc() : memref<20x10xf32, 1>
925  affine.for %arg13 = 0 to 20 {
926    affine.for %arg14 = 0 to 10 {
927      %21 = affine.load %3[] : memref<f32, 1>
928      affine.store %21, %7[%arg13, %arg14] : memref<20x10xf32, 1>
929    }
930  }
931  %8 = memref.alloc() : memref<20x10xf32, 1>
932  affine.for %arg13 = 0 to 20 {
933    affine.for %arg14 = 0 to 10 {
934      %21 = affine.load %arg1[%arg13, %arg14] : memref<20x10xf32, 1>
935      %22 = affine.load %7[%arg13, %arg14] : memref<20x10xf32, 1>
936      %23 = arith.mulf %22, %21 : f32
937      affine.store %23, %8[%arg13, %arg14] : memref<20x10xf32, 1>
938    }
939  }
940  %9 = memref.alloc() : memref<20x10xf32, 1>
941  affine.for %arg13 = 0 to 20 {
942    affine.for %arg14 = 0 to 10 {
943      %21 = affine.load %arg1[%arg13, %arg14] : memref<20x10xf32, 1>
944      %22 = affine.load %8[%arg13, %arg14] : memref<20x10xf32, 1>
945      %23 = arith.mulf %22, %21 : f32
946      affine.store %23, %9[%arg13, %arg14] : memref<20x10xf32, 1>
947    }
948  }
  // Writes function argument %arg11; a later nest reads it back.
949  affine.for %arg13 = 0 to 20 {
950    affine.for %arg14 = 0 to 10 {
951      %21 = affine.load %9[%arg13, %arg14] : memref<20x10xf32, 1>
952      %22 = affine.load %2[%arg13, %arg14] : memref<20x10xf32, 1>
953      %23 = arith.addf %22, %21 : f32
954      affine.store %23, %arg11[%arg13, %arg14] : memref<20x10xf32, 1>
955    }
956  }
957  %10 = memref.alloc() : memref<20x10xf32, 1>
958  affine.for %arg13 = 0 to 20 {
959    affine.for %arg14 = 0 to 10 {
960      %21 = affine.load %1[%arg13, %arg14] : memref<20x10xf32, 1>
961      %22 = affine.load %arg2[%arg13, %arg14] : memref<20x10xf32, 1>
962      %23 = arith.mulf %22, %21 : f32
963      affine.store %23, %10[%arg13, %arg14] : memref<20x10xf32, 1>
964    }
965  }
  // Writes function argument %arg10; the next nest reads it back.
966  affine.for %arg13 = 0 to 20 {
967    affine.for %arg14 = 0 to 10 {
968      %21 = affine.load %8[%arg13, %arg14] : memref<20x10xf32, 1>
969      %22 = affine.load %10[%arg13, %arg14] : memref<20x10xf32, 1>
970      %23 = arith.addf %22, %21 : f32
971      affine.store %23, %arg10[%arg13, %arg14] : memref<20x10xf32, 1>
972    }
973  }
  // Loads the same element of %arg10 twice and multiplies the two loads.
974  %11 = memref.alloc() : memref<20x10xf32, 1>
975  affine.for %arg13 = 0 to 20 {
976    affine.for %arg14 = 0 to 10 {
977      %21 = affine.load %arg10[%arg13, %arg14] : memref<20x10xf32, 1>
978      %22 = affine.load %arg10[%arg13, %arg14] : memref<20x10xf32, 1>
979      %23 = arith.mulf %22, %21 : f32
980      affine.store %23, %11[%arg13, %arg14] : memref<20x10xf32, 1>
981    }
982  }
  // Reads back %arg11, which an earlier nest wrote.
983  %12 = memref.alloc() : memref<20x10xf32, 1>
984  affine.for %arg13 = 0 to 20 {
985    affine.for %arg14 = 0 to 10 {
986      %21 = affine.load %11[%arg13, %arg14] : memref<20x10xf32, 1>
987      %22 = affine.load %arg11[%arg13, %arg14] : memref<20x10xf32, 1>
988      %23 = arith.subf %22, %21 : f32
989      affine.store %23, %12[%arg13, %arg14] : memref<20x10xf32, 1>
990    }
991  }
  // Fill %13 with the scalar loaded from %arg7.
992  %13 = memref.alloc() : memref<20x10xf32, 1>
993  affine.for %arg13 = 0 to 20 {
994    affine.for %arg14 = 0 to 10 {
995      %21 = affine.load %arg7[] : memref<f32, 1>
996      affine.store %21, %13[%arg13, %arg14] : memref<20x10xf32, 1>
997    }
998  }
999  %14 = memref.alloc() : memref<20x10xf32, 1>
1000  affine.for %arg13 = 0 to 20 {
1001    affine.for %arg14 = 0 to 10 {
1002      %21 = affine.load %arg4[%arg13, %arg14] : memref<20x10xf32, 1>
1003      %22 = affine.load %13[%arg13, %arg14] : memref<20x10xf32, 1>
1004      %23 = arith.mulf %22, %21 : f32
1005      affine.store %23, %14[%arg13, %arg14] : memref<20x10xf32, 1>
1006    }
1007  }
  // Fill %15 with the scalar loaded from %arg8.
1008  %15 = memref.alloc() : memref<20x10xf32, 1>
1009  affine.for %arg13 = 0 to 20 {
1010    affine.for %arg14 = 0 to 10 {
1011      %21 = affine.load %arg8[] : memref<f32, 1>
1012      affine.store %21, %15[%arg13, %arg14] : memref<20x10xf32, 1>
1013    }
1014  }
1015  %16 = memref.alloc() : memref<20x10xf32, 1>
1016  affine.for %arg13 = 0 to 20 {
1017    affine.for %arg14 = 0 to 10 {
1018      %21 = affine.load %15[%arg13, %arg14] : memref<20x10xf32, 1>
1019      %22 = affine.load %12[%arg13, %arg14] : memref<20x10xf32, 1>
1020      %23 = arith.addf %22, %21 : f32
1021      affine.store %23, %16[%arg13, %arg14] : memref<20x10xf32, 1>
1022    }
1023  }
  // Element-wise square root of %16.
1024  %17 = memref.alloc() : memref<20x10xf32, 1>
1025  affine.for %arg13 = 0 to 20 {
1026    affine.for %arg14 = 0 to 10 {
1027      %21 = affine.load %16[%arg13, %arg14] : memref<20x10xf32, 1>
1028      %22 = math.sqrt %21 : f32
1029      affine.store %22, %17[%arg13, %arg14] : memref<20x10xf32, 1>
1030    }
1031  }
  // Fill %18 with the scalar loaded from %arg5.
1032  %18 = memref.alloc() : memref<20x10xf32, 1>
1033  affine.for %arg13 = 0 to 20 {
1034    affine.for %arg14 = 0 to 10 {
1035      %21 = affine.load %arg5[] : memref<f32, 1>
1036      affine.store %21, %18[%arg13, %arg14] : memref<20x10xf32, 1>
1037    }
1038  }
1039  %19 = memref.alloc() : memref<20x10xf32, 1>
1040  affine.for %arg13 = 0 to 20 {
1041    affine.for %arg14 = 0 to 10 {
1042      %21 = affine.load %arg1[%arg13, %arg14] : memref<20x10xf32, 1>
1043      %22 = affine.load %18[%arg13, %arg14] : memref<20x10xf32, 1>
1044      %23 = arith.mulf %22, %21 : f32
1045      affine.store %23, %19[%arg13, %arg14] : memref<20x10xf32, 1>
1046    }
1047  }
1048  %20 = memref.alloc() : memref<20x10xf32, 1>
1049  affine.for %arg13 = 0 to 20 {
1050    affine.for %arg14 = 0 to 10 {
1051      %21 = affine.load %17[%arg13, %arg14] : memref<20x10xf32, 1>
1052      %22 = affine.load %19[%arg13, %arg14] : memref<20x10xf32, 1>
1053      %23 = arith.divf %22, %21 : f32
1054      affine.store %23, %20[%arg13, %arg14] : memref<20x10xf32, 1>
1055    }
1056  }
  // Writes function argument %arg12; the final nest reads it back.
1057  affine.for %arg13 = 0 to 20 {
1058    affine.for %arg14 = 0 to 10 {
1059      %21 = affine.load %20[%arg13, %arg14] : memref<20x10xf32, 1>
1060      %22 = affine.load %14[%arg13, %arg14] : memref<20x10xf32, 1>
1061      %23 = arith.addf %22, %21 : f32
1062      affine.store %23, %arg12[%arg13, %arg14] : memref<20x10xf32, 1>
1063    }
1064  }
  // Final nest: %arg9 = %arg0 - %arg12, element-wise.
1065  affine.for %arg13 = 0 to 20 {
1066    affine.for %arg14 = 0 to 10 {
1067      %21 = affine.load %arg12[%arg13, %arg14] : memref<20x10xf32, 1>
1068      %22 = affine.load %arg0[%arg13, %arg14] : memref<20x10xf32, 1>
1069      %23 = arith.subf %22, %21 : f32
1070      affine.store %23, %arg9[%arg13, %arg14] : memref<20x10xf32, 1>
1071    }
1072  }
1073  return
1074}
// Exactly two `affine.for` ops are expected in the output for this function.
1075// CHECK:         affine.for
1076// CHECK:         affine.for
1077// CHECK-NOT:     affine.for
1078
// Producer/consumer loops that store to and load from %alias, a
// memref.reinterpret_cast view of the function argument %a. The expected
// output has a single loop whose store and load still use memref<10xf32>.
1079// CHECK-LABEL: func @alias_escaping_memref
1080func.func @alias_escaping_memref(%a : memref<2x5xf32>) {
1081  %cst = arith.constant 0.000000e+00 : f32
  // View the 2x5 argument as a flat 10-element memref.
1082  %alias = memref.reinterpret_cast %a to offset: [0], sizes: [10], strides: [1] : memref<2x5xf32> to memref<10xf32>
  // Producer: writes %cst to every element of %alias.
1083  affine.for %i0 = 0 to 10 {
1084    affine.store %cst, %alias[%i0] : memref<10xf32>
1085  }
1086
  // Consumer: reads every element of %alias.
1087  affine.for %i1 = 0 to 10 {
1088    %0 = affine.load %alias[%i1] : memref<10xf32>
1089  }
1090  // Fusion happens, but memref isn't privatized since %alias is an alias of a
1091  // function argument.
1092  // CHECK:       memref.reinterpret_cast
1093  // CHECK-NEXT:  affine.for
1094  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
1095  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
1096  // CHECK-NEXT:  }
1097  // CHECK-NOT:   affine.for
1098
1099  return
1100}
1101
// Producer/consumer loops that store to and load from %may_alias, a memref
// returned by a call to @bar. The expected output has a single loop whose
// store and load still use memref<10xf32>.
1102// CHECK-LABEL: func @unknown_memref_def_op
1103func.func @unknown_memref_def_op() {
1104  %cst = arith.constant 0.000000e+00 : f32
  // The memref comes from a call, not from an allocation in this function.
1105  %may_alias = call @bar() : () -> memref<10xf32>
  // Producer: writes %cst to every element of %may_alias.
1106  affine.for %i0 = 0 to 10 {
1107    affine.store %cst, %may_alias[%i0] : memref<10xf32>
1108  }
1109
  // Consumer: reads every element of %may_alias.
1110  affine.for %i1 = 0 to 10 {
1111    %0 = affine.load %may_alias[%i1] : memref<10xf32>
1112  }
1113  // Fusion happens, but memref isn't privatized since %may_alias's origin is
1114  // unknown.
1115  // CHECK:       call
1116  // CHECK-NEXT:  affine.for
1117  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
1118  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
1119  // CHECK-NEXT:  }
1120  // CHECK-NOT:   affine.for
1121
1122  return
1123}
// Declaration only (no body): nothing in this file shows how the returned
// memref is created.
1124func.func private @bar() -> memref<10xf32>
1125
1126
1127// Add further tests in mlir/test/Dialect/Affine/loop-fusion-4.mlir
1128