// RUN: mlir-opt -allow-unregistered-dialect %s -affine-scalrep | FileCheck %s
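
// This file exercises the affine scalar replacement pass: store-to-load
// forwarding, redundant load elimination, and redundant store elimination on
// affine memory operations, including their vector variants.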

// CHECK-DAG: [[$MAP0:#map[0-9]*]] = affine_map<(d0, d1) -> (d1 + 1)>
// CHECK-DAG: [[$MAP1:#map[0-9]*]] = affine_map<(d0, d1) -> (d0)>
// CHECK-DAG: [[$MAP2:#map[0-9]*]] = affine_map<(d0, d1) -> (d1)>
// CHECK-DAG: [[$MAP3:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 - 1)>
// CHECK-DAG: [[$MAP4:#map[0-9]*]] = affine_map<(d0) -> (d0 + 1)>
// CHECK-DAG: [[$IDENT:#map[0-9]*]] = affine_map<(d0) -> (d0)>

// CHECK-LABEL: func @simple_store_load() {
func.func @simple_store_load() {
  %cf7 = arith.constant 7.0 : f32
  %m = memref.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
    %v0 = affine.load %m[%i0] : memref<10xf32>
    %v1 = arith.addf %v0, %v0 : f32
  }
  memref.dealloc %m : memref<10xf32>
  return
// CHECK:       %[[C7:.*]] = arith.constant 7.000000e+00 : f32
// CHECK-NEXT:  affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:    arith.addf %[[C7]], %[[C7]] : f32
// CHECK-NEXT:  }
// CHECK-NEXT:  return
}

// CHECK-LABEL: func @multi_store_load() {
func.func @multi_store_load() {
  %cf7 = arith.constant 7.0 : f32
  %cf8 = arith.constant 8.0 : f32
  %cf9 = arith.constant 9.0 : f32
  %m = gpu.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
    %v0 = affine.load %m[%i0] : memref<10xf32>
    %v1 = arith.addf %v0, %v0 : f32
    affine.store %cf8, %m[%i0] : memref<10xf32>
    affine.store %cf9, %m[%i0] : memref<10xf32>
    %v2 = affine.load %m[%i0] : memref<10xf32>
    %v3 = affine.load %m[%i0] : memref<10xf32>
    %v4 = arith.mulf %v2, %v3 : f32
  }
  gpu.dealloc %m : memref<10xf32>
  return
// CHECK-NEXT:  %[[C7:.*]] = arith.constant 7.000000e+00 : f32
// CHECK-NEXT:  arith.constant 8.000000e+00 : f32
// CHECK-NEXT:  %[[C9:.*]] = arith.constant 9.000000e+00 : f32
// CHECK-NEXT:  affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:    arith.addf %[[C7]], %[[C7]] : f32
// CHECK-NEXT:    arith.mulf %[[C9]], %[[C9]] : f32
// CHECK-NEXT:  }
// CHECK-NEXT:  return
}

// Store-to-load forwarding can see through affine.apply ops since it relies
// on dependence information.
// CHECK-LABEL: func @store_load_affine_apply
func.func @store_load_affine_apply() -> memref<10x10xf32> {
  %cf7 = arith.constant 7.0 : f32
  %m = memref.alloc() : memref<10x10xf32>
  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 10 {
      %t0 = affine.apply affine_map<(d0, d1) -> (d1 + 1)>(%i0, %i1)
      %t1 = affine.apply affine_map<(d0, d1) -> (d0)>(%i0, %i1)
      %idx0 = affine.apply affine_map<(d0, d1) -> (d1)> (%t0, %t1)
      %idx1 = affine.apply affine_map<(d0, d1) -> (d0 - 1)> (%t0, %t1)
      affine.store %cf7, %m[%idx0, %idx1] : memref<10x10xf32>
      // CHECK-NOT: affine.load %{{[0-9]+}}
      %v0 = affine.load %m[%i0, %i1] : memref<10x10xf32>
      %v1 = arith.addf %v0, %v0 : f32
    }
  }
  // The memref and its stores won't be erased due to this memref return.
  return %m : memref<10x10xf32>
// CHECK:       %{{.*}} = arith.constant 7.000000e+00 : f32
// CHECK-NEXT:  %{{.*}} = memref.alloc() : memref<10x10xf32>
// CHECK-NEXT:  affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:    affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:      %{{.*}} = affine.apply [[$MAP0]](%{{.*}}, %{{.*}})
// CHECK-NEXT:      %{{.*}} = affine.apply [[$MAP1]](%{{.*}}, %{{.*}})
// CHECK-NEXT:      %{{.*}} = affine.apply [[$MAP2]](%{{.*}}, %{{.*}})
// CHECK-NEXT:      %{{.*}} = affine.apply [[$MAP3]](%{{.*}}, %{{.*}})
// CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
// CHECK-NEXT:      %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
// CHECK-NEXT:    }
// CHECK-NEXT:  }
// CHECK-NEXT:  return %{{.*}} : memref<10x10xf32>
}

// CHECK-LABEL: func @store_load_nested
func.func @store_load_nested(%N : index) {
  %cf7 = arith.constant 7.0 : f32
  %m = memref.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
    affine.for %i1 = 0 to %N {
      %v0 = affine.load %m[%i0] : memref<10xf32>
      %v1 = arith.addf %v0, %v0 : f32
    }
  }
  return
// CHECK:       %{{.*}} = arith.constant 7.000000e+00 : f32
// CHECK-NEXT:  affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:    affine.for %{{.*}} = 0 to %{{.*}} {
// CHECK-NEXT:      %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
// CHECK-NEXT:    }
// CHECK-NEXT:  }
// CHECK-NEXT:  return
}

// No forwarding happens here since either of the two stores could be the last
// writer; forwarding will, however, become possible once loop-live-out SSA
// scalars are available.
// CHECK-LABEL: func @multi_store_load_nested_no_fwd
func.func @multi_store_load_nested_no_fwd(%N : index) {
  %cf7 = arith.constant 7.0 : f32
  %cf8 = arith.constant 8.0 : f32
  %m = memref.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
    affine.for %i1 = 0 to %N {
      affine.store %cf8, %m[%i1] : memref<10xf32>
    }
    affine.for %i2 = 0 to %N {
      // CHECK: %{{[0-9]+}} = affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
      %v0 = affine.load %m[%i0] : memref<10xf32>
      %v1 = arith.addf %v0, %v0 : f32
    }
  }
  return
}

// No forwarding happens here since the value reaching the load could come
// from either of the two stores.
// CHECK-LABEL: func @store_load_store_nested_no_fwd
func.func @store_load_store_nested_no_fwd(%N : index) {
  %cf7 = arith.constant 7.0 : f32
  %cf9 = arith.constant 9.0 : f32
  %m = memref.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
    affine.for %i1 = 0 to %N {
      // CHECK: %{{[0-9]+}} = affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
      %v0 = affine.load %m[%i0] : memref<10xf32>
      %v1 = arith.addf %v0, %v0 : f32
      affine.store %cf9, %m[%i0] : memref<10xf32>
    }
  }
  return
}

// Forwarding happens here since the last store postdominates all other stores
// and other forwarding criteria are satisfied.
// CHECK-LABEL: func @multi_store_load_nested_fwd
func.func @multi_store_load_nested_fwd(%N : index) {
  %cf7 = arith.constant 7.0 : f32
  %cf8 = arith.constant 8.0 : f32
  %cf9 = arith.constant 9.0 : f32
  %cf10 = arith.constant 10.0 : f32
  %m = memref.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
    affine.for %i1 = 0 to %N {
      affine.store %cf8, %m[%i1] : memref<10xf32>
    }
    affine.for %i2 = 0 to %N {
      affine.store %cf9, %m[%i2] : memref<10xf32>
    }
    affine.store %cf10, %m[%i0] : memref<10xf32>
    affine.for %i3 = 0 to %N {
      // CHECK-NOT: %{{[0-9]+}} = affine.load
      %v0 = affine.load %m[%i0] : memref<10xf32>
      %v1 = arith.addf %v0, %v0 : f32
    }
  }
  return
}

// There is no unique load location for the store to forward to.
// CHECK-LABEL: func @store_load_no_fwd
func.func @store_load_no_fwd() {
  %cf7 = arith.constant 7.0 : f32
  %m = memref.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
    affine.for %i1 = 0 to 10 {
      affine.for %i2 = 0 to 10 {
        // CHECK: affine.load
        %v0 = affine.load %m[%i2] : memref<10xf32>
        %v1 = arith.addf %v0, %v0 : f32
      }
    }
  }
  return
}

// Forwarding happens here as there is a one-to-one store-load correspondence.
// CHECK-LABEL: func @store_load_fwd
func.func @store_load_fwd() {
  %cf7 = arith.constant 7.0 : f32
  %c0 = arith.constant 0 : index
  %m = memref.alloc() : memref<10xf32>
  affine.store %cf7, %m[%c0] : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 10 {
      affine.for %i2 = 0 to 10 {
        // CHECK-NOT: affine.load %{{[0-9]+}}
        %v0 = affine.load %m[%c0] : memref<10xf32>
        %v1 = arith.addf %v0, %v0 : f32
      }
    }
  }
  return
}

// Although there is a dependence from the second store to the load, it is
// satisfied by the outer surrounding loop and does not prevent the first
// store from being forwarded to the load.
// CHECK-LABEL: func @store_load_store_nested_fwd
func.func @store_load_store_nested_fwd(%N : index) -> f32 {
  %cf7 = arith.constant 7.0 : f32
  %cf9 = arith.constant 9.0 : f32
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %m = memref.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
    affine.for %i1 = 0 to %N {
      %v0 = affine.load %m[%i0] : memref<10xf32>
      %v1 = arith.addf %v0, %v0 : f32
      %idx = affine.apply affine_map<(d0) -> (d0 + 1)> (%i0)
      affine.store %cf9, %m[%idx] : memref<10xf32>
    }
  }
  // Due to this load, the memref isn't optimized away.
  %v3 = affine.load %m[%c1] : memref<10xf32>
  return %v3 : f32
// CHECK:       %{{.*}} = memref.alloc() : memref<10xf32>
// CHECK-NEXT:  affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
// CHECK-NEXT:    affine.for %{{.*}} = 0 to %{{.*}} {
// CHECK-NEXT:      %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
// CHECK-NEXT:      %{{.*}} = affine.apply [[$MAP4]](%{{.*}})
// CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
// CHECK-NEXT:    }
// CHECK-NEXT:  }
// CHECK-NEXT:  %{{.*}} = affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
// CHECK-NEXT:  return %{{.*}} : f32
}

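// The store to %A[%M] below cannot be forwarded to the load from %A[%N]
// since %M and %N are not known to be equal.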
// CHECK-LABEL: func @should_not_fwd
func.func @should_not_fwd(%A: memref<100xf32>, %M : index, %N : index) -> f32 {
  %cf = arith.constant 0.0 : f32
  affine.store %cf, %A[%M] : memref<100xf32>
  // CHECK: affine.load %{{.*}}[%{{.*}}]
  %v = affine.load %A[%N] : memref<100xf32>
  return %v : f32
}

// The store can be forwarded to the load from %A[%j, %i], but not to the
// loads from %A[%i, %j].
// CHECK-LABEL: func @refs_not_known_to_be_equal
func.func @refs_not_known_to_be_equal(%A : memref<100 x 100 x f32>, %M : index) {
  %N = affine.apply affine_map<(d0) -> (d0 + 1)> (%M)
  %cf1 = arith.constant 1.0 : f32
  affine.for %i = 0 to 100 {
  // CHECK: affine.for %[[I:.*]] =
    affine.for %j = 0 to 100 {
    // CHECK: affine.for %[[J:.*]] =
      // CHECK: affine.load %{{.*}}[%[[I]], %[[J]]]
      %u = affine.load %A[%i, %j] : memref<100x100xf32>
      // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%[[J]], %[[I]]]
      affine.store %cf1, %A[%j, %i] : memref<100x100xf32>
      // CHECK-NEXT: affine.load %{{.*}}[%[[I]], %[[J]]]
      %v = affine.load %A[%i, %j] : memref<100x100xf32>
      // This load should disappear.
      %w = affine.load %A[%j, %i] : memref<100x100xf32>
      // CHECK-NEXT: "foo"
      "foo" (%u, %v, %w) : (f32, f32, f32) -> ()
    }
  }
  return
}

// CHECK-LABEL: func @elim_load_after_store
func.func @elim_load_after_store(%arg0: memref<100xf32>, %arg1: memref<100xf32>) {
  %alloc = memref.alloc() : memref<1xf32>
  %alloc_0 = memref.alloc() : memref<1xf32>
  // CHECK: affine.for
  affine.for %arg2 = 0 to 100 {
    // CHECK: affine.load
    %0 = affine.load %arg0[%arg2] : memref<100xf32>
    %1 = affine.load %arg0[%arg2] : memref<100xf32>
    // CHECK: arith.addf
    %2 = arith.addf %0, %1 : f32
    affine.store %2, %alloc_0[0] : memref<1xf32>
    %3 = affine.load %arg0[%arg2] : memref<100xf32>
    %4 = affine.load %alloc_0[0] : memref<1xf32>
    // CHECK-NEXT: arith.addf
    %5 = arith.addf %3, %4 : f32
    affine.store %5, %alloc[0] : memref<1xf32>
    %6 = affine.load %arg0[%arg2] : memref<100xf32>
    %7 = affine.load %alloc[0] : memref<1xf32>
    %8 = arith.addf %6, %7 : f32
    affine.store %8, %arg1[%arg2] : memref<100xf32>
  }
  return
}

// This test checks value forwarding from vector stores to vector loads. The
// value loaded from %in can be stored directly to %out by eliminating the
// store to and load from %tmp.
func.func @vector_forwarding(%in : memref<512xf32>, %out : memref<512xf32>) {
  %tmp = memref.alloc() : memref<512xf32>
  affine.for %i = 0 to 16 {
    %ld0 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
    affine.vector_store %ld0, %tmp[32*%i] : memref<512xf32>, vector<32xf32>
    %ld1 = affine.vector_load %tmp[32*%i] : memref<512xf32>, vector<32xf32>
    affine.vector_store %ld1, %out[32*%i] : memref<512xf32>, vector<32xf32>
  }
  return
}

// CHECK-LABEL: func @vector_forwarding
// CHECK:      affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT:   %[[LDVAL:.*]] = affine.vector_load
// CHECK-NEXT:   affine.vector_store %[[LDVAL]],{{.*}}
// CHECK-NEXT: }

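// No forwarding happens below: the value stored to %tmp is a vector<32xf32>
// while the value loaded back from it is a vector<16xf32>.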
func.func @vector_no_forwarding(%in : memref<512xf32>, %out : memref<512xf32>) {
  %tmp = memref.alloc() : memref<512xf32>
  affine.for %i = 0 to 16 {
    %ld0 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
    affine.vector_store %ld0, %tmp[32*%i] : memref<512xf32>, vector<32xf32>
    %ld1 = affine.vector_load %tmp[32*%i] : memref<512xf32>, vector<16xf32>
    affine.vector_store %ld1, %out[32*%i] : memref<512xf32>, vector<16xf32>
  }
  return
}

// CHECK-LABEL: func @vector_no_forwarding
// CHECK:      affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT:   %[[LDVAL:.*]] = affine.vector_load
// CHECK-NEXT:   affine.vector_store %[[LDVAL]],{{.*}}
// CHECK-NEXT:   %[[LDVAL1:.*]] = affine.vector_load
// CHECK-NEXT:   affine.vector_store %[[LDVAL1]],{{.*}}
// CHECK-NEXT: }

// CHECK-LABEL: func @simple_three_loads
func.func @simple_three_loads(%in : memref<10xf32>) {
  affine.for %i0 = 0 to 10 {
    // CHECK:       affine.load
    %v0 = affine.load %in[%i0] : memref<10xf32>
    // CHECK-NOT:   affine.load
    %v1 = affine.load %in[%i0] : memref<10xf32>
    %v2 = arith.addf %v0, %v1 : f32
    %v3 = affine.load %in[%i0] : memref<10xf32>
    %v4 = arith.addf %v2, %v3 : f32
  }
  return
}

// CHECK-LABEL: func @nested_loads_const_index
func.func @nested_loads_const_index(%in : memref<10xf32>) {
  %c0 = arith.constant 0 : index
  // CHECK:       affine.load
  %v0 = affine.load %in[%c0] : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 20 {
      affine.for %i2 = 0 to 30 {
        // CHECK-NOT:   affine.load
        %v1 = affine.load %in[%c0] : memref<10xf32>
        %v2 = arith.addf %v0, %v1 : f32
      }
    }
  }
  return
}

// CHECK-LABEL: func @nested_loads
func.func @nested_loads(%N : index, %in : memref<10xf32>) {
  affine.for %i0 = 0 to 10 {
    // CHECK:       affine.load
    %v0 = affine.load %in[%i0] : memref<10xf32>
    affine.for %i1 = 0 to %N {
      // CHECK-NOT:   affine.load
      %v1 = affine.load %in[%i0] : memref<10xf32>
      %v2 = arith.addf %v0, %v1 : f32
    }
  }
  return
}

// CHECK-LABEL: func @nested_loads_different_memref_accesses_no_cse
func.func @nested_loads_different_memref_accesses_no_cse(%in : memref<10xf32>) {
  affine.for %i0 = 0 to 10 {
    // CHECK:       affine.load
    %v0 = affine.load %in[%i0] : memref<10xf32>
    affine.for %i1 = 0 to 20 {
      // CHECK:       affine.load
      %v1 = affine.load %in[%i1] : memref<10xf32>
      %v2 = arith.addf %v0, %v1 : f32
    }
  }
  return
}

// CHECK-LABEL: func @load_load_store
func.func @load_load_store(%m : memref<10xf32>) {
  affine.for %i0 = 0 to 10 {
    // CHECK:       affine.load
    %v0 = affine.load %m[%i0] : memref<10xf32>
    // CHECK-NOT:       affine.load
    %v1 = affine.load %m[%i0] : memref<10xf32>
    %v2 = arith.addf %v0, %v1 : f32
    affine.store %v2, %m[%i0] : memref<10xf32>
  }
  return
}

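// The store to %m[%i0] in the inner loop intervenes between %v0 and the
// inner load across iterations of %i1, so that load cannot reuse %v0.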
// CHECK-LABEL: func @load_load_store_2_loops_no_cse
func.func @load_load_store_2_loops_no_cse(%N : index, %m : memref<10xf32>) {
  affine.for %i0 = 0 to 10 {
    // CHECK:       affine.load
    %v0 = affine.load %m[%i0] : memref<10xf32>
    affine.for %i1 = 0 to %N {
      // CHECK:       affine.load
      %v1 = affine.load %m[%i0] : memref<10xf32>
      %v2 = arith.addf %v0, %v1 : f32
      affine.store %v2, %m[%i0] : memref<10xf32>
    }
  }
  return
}

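// The store to %m[%i0] in the %i1 loop intervenes across %i1 iterations, so
// the load in the %i2 loop cannot reuse %v0.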
// CHECK-LABEL: func @load_load_store_3_loops_no_cse
func.func @load_load_store_3_loops_no_cse(%m : memref<10xf32>) {
  %cf1 = arith.constant 1.0 : f32
  affine.for %i0 = 0 to 10 {
    // CHECK:       affine.load
    %v0 = affine.load %m[%i0] : memref<10xf32>
    affine.for %i1 = 0 to 20 {
      affine.for %i2 = 0 to 30 {
        // CHECK:       affine.load
        %v1 = affine.load %m[%i0] : memref<10xf32>
        %v2 = arith.addf %v0, %v1 : f32
      }
      affine.store %cf1, %m[%i0] : memref<10xf32>
    }
  }
  return
}

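// Here the store to %m[%i0] is outside the %i1 loop, so it does not intervene
// between %v0 and the load in the %i2 loop; that load is eliminated.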
// CHECK-LABEL: func @load_load_store_3_loops
func.func @load_load_store_3_loops(%m : memref<10xf32>) {
  %cf1 = arith.constant 1.0 : f32
  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 20 {
      // CHECK:       affine.load
      %v0 = affine.load %m[%i0] : memref<10xf32>
      affine.for %i2 = 0 to 30 {
        // CHECK-NOT:   affine.load
        %v1 = affine.load %m[%i0] : memref<10xf32>
        %v2 = arith.addf %v0, %v1 : f32
      }
    }
    affine.store %cf1, %m[%i0] : memref<10xf32>
  }
  return
}

// CHECK-LABEL: func @loads_in_sibling_loops_const_index_no_cse
func.func @loads_in_sibling_loops_const_index_no_cse(%m : memref<10xf32>) {
  %c0 = arith.constant 0 : index
  affine.for %i0 = 0 to 10 {
    // CHECK:       affine.load
    %v0 = affine.load %m[%c0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    // CHECK:       affine.load
    %v0 = affine.load %m[%c0] : memref<10xf32>
    %v1 = arith.addf %v0, %v0 : f32
  }
  return
}

// CHECK-LABEL: func @load_load_affine_apply
func.func @load_load_affine_apply(%in : memref<10x10xf32>) {
  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 10 {
      %t0 = affine.apply affine_map<(d0, d1) -> (d1 + 1)>(%i0, %i1)
      %t1 = affine.apply affine_map<(d0, d1) -> (d0)>(%i0, %i1)
      %idx0 = affine.apply affine_map<(d0, d1) -> (d1)> (%t0, %t1)
      %idx1 = affine.apply affine_map<(d0, d1) -> (d0 - 1)> (%t0, %t1)
      // CHECK:       affine.load
      %v0 = affine.load %in[%idx0, %idx1] : memref<10x10xf32>
      // CHECK-NOT:   affine.load
      %v1 = affine.load %in[%i0, %i1] : memref<10x10xf32>
      %v2 = arith.addf %v0, %v1 : f32
    }
  }
  return
}

// CHECK-LABEL: func @vector_loads
func.func @vector_loads(%in : memref<512xf32>, %out : memref<512xf32>) {
  affine.for %i = 0 to 16 {
    // CHECK:       affine.vector_load
    %ld0 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
    // CHECK-NOT:   affine.vector_load
    %ld1 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
    %add = arith.addf %ld0, %ld1 : vector<32xf32>
    affine.vector_store %ld1, %out[32*%i] : memref<512xf32>, vector<32xf32>
  }
  return
}

// CHECK-LABEL: func @vector_loads_no_cse
func.func @vector_loads_no_cse(%in : memref<512xf32>, %out : memref<512xf32>) {
  affine.for %i = 0 to 16 {
    // CHECK:       affine.vector_load
    %ld0 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
    // CHECK:       affine.vector_load
    %ld1 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<16xf32>
    affine.vector_store %ld1, %out[32*%i] : memref<512xf32>, vector<16xf32>
  }
  return
}

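// The intervening vector store to %in[16*%i] may overlap the region read by
// the second load from %in[32*%i] (e.g., at %i = 1), so no load elimination
// happens here.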
// CHECK-LABEL: func @vector_load_store_load_no_cse
func.func @vector_load_store_load_no_cse(%in : memref<512xf32>, %out : memref<512xf32>) {
  affine.for %i = 0 to 16 {
    // CHECK:       affine.vector_load
    %ld0 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
    affine.vector_store %ld0, %in[16*%i] : memref<512xf32>, vector<32xf32>
    // CHECK:       affine.vector_load
    %ld1 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
    %add = arith.addf %ld0, %ld1 : vector<32xf32>
    affine.vector_store %ld1, %out[32*%i] : memref<512xf32>, vector<32xf32>
  }
  return
}

// CHECK-LABEL: func @reduction_multi_store
func.func @reduction_multi_store() -> memref<1xf32> {
  %A = memref.alloc() : memref<1xf32>
  %cf0 = arith.constant 0.0 : f32
  %cf5 = arith.constant 5.0 : f32

  affine.store %cf0, %A[0] : memref<1xf32>
  affine.for %i = 0 to 100 step 2 {
    %l = affine.load %A[0] : memref<1xf32>
    %s = arith.addf %l, %cf5 : f32
    // Store to load forwarding from this store should happen.
    affine.store %s, %A[0] : memref<1xf32>
    %m = affine.load %A[0] : memref<1xf32>
    "test.foo"(%m) : (f32) -> ()
  }

// CHECK:       affine.for
// CHECK:         affine.load
// CHECK:         affine.store %[[S:.*]],
// CHECK-NEXT:    "test.foo"(%[[S]])

  return %A : memref<1xf32>
}

// CHECK-LABEL: func @vector_load_affine_apply_store_load
func.func @vector_load_affine_apply_store_load(%in : memref<512xf32>, %out : memref<512xf32>) {
  %cf1 = arith.constant 1 : index
  affine.for %i = 0 to 15 {
    // CHECK:       affine.vector_load
    %ld0 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
    %idx = affine.apply affine_map<(d0) -> (d0 + 1)> (%i)
    affine.vector_store %ld0, %in[32*%idx] : memref<512xf32>, vector<32xf32>
    // CHECK-NOT:   affine.vector_load
    %ld1 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
    %add = arith.addf %ld0, %ld1 : vector<32xf32>
    affine.vector_store %ld1, %out[32*%i] : memref<512xf32>, vector<32xf32>
  }
  return
}

// CHECK-LABEL: func @external_no_forward_load

func.func @external_no_forward_load(%in : memref<512xf32>, %out : memref<512xf32>) {
  affine.for %i = 0 to 16 {
    %ld0 = affine.load %in[32*%i] : memref<512xf32>
    affine.store %ld0, %out[32*%i] : memref<512xf32>
    "memop"(%in, %out) : (memref<512xf32>, memref<512xf32>) -> ()
    %ld1 = affine.load %in[32*%i] : memref<512xf32>
    affine.store %ld1, %out[32*%i] : memref<512xf32>
  }
  return
}
// CHECK:   affine.load
// CHECK:   affine.store
// CHECK:   affine.load
// CHECK:   affine.store

// CHECK-LABEL: func @external_no_forward_store

func.func @external_no_forward_store(%in : memref<512xf32>, %out : memref<512xf32>) {
  %cf1 = arith.constant 1.0 : f32
  affine.for %i = 0 to 16 {
    affine.store %cf1, %in[32*%i] : memref<512xf32>
    "memop"(%in, %out) : (memref<512xf32>, memref<512xf32>) -> ()
    %ld1 = affine.load %in[32*%i] : memref<512xf32>
    affine.store %ld1, %out[32*%i] : memref<512xf32>
  }
  return
}
// CHECK:   affine.store
// CHECK:   affine.load
// CHECK:   affine.store

// CHECK-LABEL: func @no_forward_cast

func.func @no_forward_cast(%in : memref<512xf32>, %out : memref<512xf32>) {
  %cf1 = arith.constant 1.0 : f32
  %cf2 = arith.constant 2.0 : f32
  %m2 = memref.cast %in : memref<512xf32> to memref<?xf32>
  affine.for %i = 0 to 16 {
    affine.store %cf1, %in[32*%i] : memref<512xf32>
    affine.store %cf2, %m2[32*%i] : memref<?xf32>
    %ld1 = affine.load %in[32*%i] : memref<512xf32>
    affine.store %ld1, %out[32*%i] : memref<512xf32>
  }
  return
}
// CHECK:   affine.store
// CHECK-NEXT:   affine.store
// CHECK-NEXT:   affine.load
// CHECK-NEXT:   affine.store

// No forwarding happens here: the store to %m[%i0 + 1] in the inner loop can
// write the location read by the load from %m[2 * %i0] within the same outer
// loop iteration (e.g., at %i0 = 1), so the first store is not the unique
// last writer reaching the load.

// CHECK-LABEL: func @overlap_no_fwd
func.func @overlap_no_fwd(%N : index) -> f32 {
  %cf7 = arith.constant 7.0 : f32
  %cf9 = arith.constant 9.0 : f32
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %m = memref.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 5 {
    affine.store %cf7, %m[2 * %i0] : memref<10xf32>
    affine.for %i1 = 0 to %N {
      %v0 = affine.load %m[2 * %i0] : memref<10xf32>
      %v1 = arith.addf %v0, %v0 : f32
      affine.store %cf9, %m[%i0 + 1] : memref<10xf32>
    }
  }
  // Due to this load, the memref isn't optimized away.
  %v3 = affine.load %m[%c1] : memref<10xf32>
  return %v3 : f32

// CHECK:  affine.for %{{.*}} = 0 to 5 {
// CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
// CHECK-NEXT:    affine.for %{{.*}} = 0 to %{{.*}} {
// CHECK-NEXT:      %{{.*}} = affine.load
// CHECK-NEXT:      %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
// CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
// CHECK-NEXT:    }
// CHECK-NEXT:  }
// CHECK-NEXT:  %{{.*}} = affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
// CHECK-NEXT:  return %{{.*}} : f32
}

// CHECK-LABEL: func @redundant_store_elim

func.func @redundant_store_elim(%out : memref<512xf32>) {
  %cf1 = arith.constant 1.0 : f32
  %cf2 = arith.constant 2.0 : f32
  affine.for %i = 0 to 16 {
    affine.store %cf1, %out[32*%i] : memref<512xf32>
    affine.store %cf2, %out[32*%i] : memref<512xf32>
  }
  return
}

// CHECK: affine.for
// CHECK-NEXT:   affine.store
// CHECK-NEXT: }

// CHECK-LABEL: func @redundant_store_elim_nonintervening

func.func @redundant_store_elim_nonintervening(%in : memref<512xf32>) {
  %cf1 = arith.constant 1.0 : f32
  %out = memref.alloc() : memref<512xf32>
  affine.for %i = 0 to 16 {
    affine.store %cf1, %out[32*%i] : memref<512xf32>
    %0 = affine.load %in[32*%i] : memref<512xf32>
    affine.store %0, %out[32*%i] : memref<512xf32>
  }
  return
}

// CHECK: affine.for
// CHECK-NEXT:   affine.load
// CHECK-NEXT:   affine.store
// CHECK-NEXT: }

// CHECK-LABEL: func @redundant_store_elim_fail

func.func @redundant_store_elim_fail(%out : memref<512xf32>) {
  %cf1 = arith.constant 1.0 : f32
  %cf2 = arith.constant 2.0 : f32
  affine.for %i = 0 to 16 {
    affine.store %cf1, %out[32*%i] : memref<512xf32>
    "test.use"(%out) : (memref<512xf32>) -> ()
    affine.store %cf2, %out[32*%i] : memref<512xf32>
  }
  return
}
// CHECK: affine.for
// CHECK-NEXT:   affine.store
// CHECK-NEXT:   "test.use"
// CHECK-NEXT:   affine.store
// CHECK-NEXT: }

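// The value of the last store to %arg1 is forwarded to the affine.load nested
// inside the scf.if below, even though that load sits in a non-affine region.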
// CHECK-LABEL: @with_inner_ops
func.func @with_inner_ops(%arg0: memref<?xf64>, %arg1: memref<?xf64>, %arg2: i1) {
  %cst = arith.constant 0.000000e+00 : f64
  %cst_0 = arith.constant 3.140000e+00 : f64
  %cst_1 = arith.constant 1.000000e+00 : f64
  affine.for %arg3 = 0 to 28 {
    affine.store %cst, %arg1[%arg3] : memref<?xf64>
    affine.store %cst_0, %arg1[%arg3] : memref<?xf64>
    %0 = scf.if %arg2 -> (f64) {
      scf.yield %cst_1 : f64
    } else {
      %1 = affine.load %arg1[%arg3] : memref<?xf64>
      scf.yield %1 : f64
    }
    affine.store %0, %arg0[%arg3] : memref<?xf64>
  }
  return
}

// CHECK:  %[[pi:.+]] = arith.constant 3.140000e+00 : f64
// CHECK:  %{{.*}} = scf.if %arg2 -> (f64) {
// CHECK:        scf.yield %{{.*}} : f64
// CHECK:      } else {
// CHECK:        scf.yield %[[pi]] : f64
// CHECK:      }

// Check that scalar replacement works correctly when affine memory ops are in
// the body of an scf.for.

// CHECK-LABEL: func @affine_store_load_in_scope
func.func @affine_store_load_in_scope(%memref: memref<1x4094x510x1xf32>, %memref_2: memref<4x4x1x64xf32>, %memref_0: memref<1x2046x254x1x64xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c64 = arith.constant 64 : index
  %c768 = arith.constant 768 : index
  scf.for %i = %c0 to %c768 step %c1 {
    %9 = arith.remsi %i, %c64 : index
    %10 = arith.divsi %i, %c64 : index
    %11 = arith.remsi %10, %c2 : index
    %12 = arith.divsi %10, %c2 : index
    test.affine_scope {
      %14 = arith.muli %12, %c2 : index
      %15 = arith.addi %c2, %14 : index
      %16 = arith.addi %15, %c0 : index
      %18 = arith.muli %11, %c2 : index
      %19 = arith.addi %c2, %18 : index
      %20 = affine.load %memref[0, symbol(%16), symbol(%19), 0] : memref<1x4094x510x1xf32>
      %21 = affine.load %memref_2[0, 0, 0, symbol(%9)] : memref<4x4x1x64xf32>
      %24 = affine.load %memref_0[0, symbol(%12), symbol(%11), 0, symbol(%9)] : memref<1x2046x254x1x64xf32>
      %25 = arith.mulf %20, %21 : f32
      %26 = arith.addf %24, %25 : f32
      // CHECK: %[[A:.*]] = arith.addf
      affine.store %26, %memref_0[0, symbol(%12), symbol(%11), 0, symbol(%9)] : memref<1x2046x254x1x64xf32>
      %27 = arith.addi %19, %c1 : index
      %28 = affine.load %memref[0, symbol(%16), symbol(%27), 0] : memref<1x4094x510x1xf32>
      %29 = affine.load %memref_2[0, 1, 0, symbol(%9)] : memref<4x4x1x64xf32>
      %30 = affine.load %memref_0[0, symbol(%12), symbol(%11), 0, symbol(%9)] : memref<1x2046x254x1x64xf32>
      %31 = arith.mulf %28, %29 : f32
      %32 = arith.addf %30, %31 : f32
      // The addf above gets the value forwarded from the store to %memref_0
      // above; that store's value is what %30 loads.
      // CHECK: arith.addf %[[A]],
      "terminate"() : () -> ()
    }
  }
  return
}

// No scalrep is performed here, but we ensure that the dependence check
// correctly (conservatively) fails across the affine scope boundary.

// CHECK-LABEL: func @affine_load_store_in_different_scopes
func.func @affine_load_store_in_different_scopes() -> memref<1xf32> {
  %A = memref.alloc() : memref<1xf32>
  %cf0 = arith.constant 0.0 : f32
  %cf5 = arith.constant 5.0 : f32

  affine.store %cf0, %A[0] : memref<1xf32>
  test.affine_scope {
    affine.store %cf5, %A[0] : memref<1xf32>
    "test.terminate"() : () -> ()
  }
  %v = affine.load %A[0] : memref<1xf32>
  // CHECK:      affine.store
  // CHECK-NEXT: test.affine_scope
  // CHECK:        affine.store
  // CHECK:      affine.load
  return %A : memref<1xf32>
}

// No forwarding should again happen here.

// CHECK-LABEL: func.func @no_forwarding_across_scopes
func.func @no_forwarding_across_scopes() -> memref<1xf32> {
  %A = memref.alloc() : memref<1xf32>
  %cf0 = arith.constant 0.0 : f32
  %cf5 = arith.constant 5.0 : f32
  %c0 = arith.constant 0 : index
  %c100 = arith.constant 100 : index
  %c1 = arith.constant 1 : index

  // Store shouldn't be forwarded to the load.
  affine.store %cf0, %A[0] : memref<1xf32>
  // CHECK:      test.affine_scope
  // CHECK-NEXT:   affine.load
  test.affine_scope {
    %l = affine.load %A[0] : memref<1xf32>
    %s = arith.addf %l, %cf5 : f32
    affine.store %s, %A[0] : memref<1xf32>
    "terminator"() : () -> ()
  }
  return %A : memref<1xf32>
}

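// Forwarding within an affine.parallel body works just as it does within an
// affine.for body.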
// CHECK-LABEL: func @parallel_store_load() {
func.func @parallel_store_load() {
  %cf7 = arith.constant 7.0 : f32
  %m = memref.alloc() : memref<10xf32>
  affine.parallel (%i0) = (0) to (10) {
    affine.store %cf7, %m[%i0] : memref<10xf32>
    %v0 = affine.load %m[%i0] : memref<10xf32>
    %v1 = arith.addf %v0, %v0 : f32
  }
  memref.dealloc %m : memref<10xf32>
  return
// CHECK:       %[[C7:.*]] = arith.constant 7.000000e+00 : f32
// CHECK-NEXT:  affine.parallel (%{{.*}}) = (0) to (10) {
// CHECK-NEXT:    arith.addf %[[C7]], %[[C7]] : f32
// CHECK-NEXT:  }
// CHECK-NEXT:  return
}

func.func @non_constant_parallel_store_load(%N : index) {
  %cf7 = arith.constant 7.0 : f32
  %m = memref.alloc() : memref<10xf32>
  affine.parallel (%i0) = (0) to (%N) {
    affine.store %cf7, %m[%i0] : memref<10xf32>
    %v0 = affine.load %m[%i0] : memref<10xf32>
    %v1 = arith.addf %v0, %v0 : f32
  }
  memref.dealloc %m : memref<10xf32>
  return
}
// CHECK: func.func @non_constant_parallel_store_load(%[[ARG0:.*]]: index) {
// CHECK-NEXT:  %[[C7:.*]] = arith.constant 7.000000e+00 : f32
// CHECK-NEXT:  affine.parallel (%{{.*}}) = (0) to (%[[ARG0]]) {
// CHECK-NEXT:    arith.addf %[[C7]], %[[C7]] : f32
// CHECK-NEXT:  }
// CHECK-NEXT:  return

// CHECK-LABEL: func @parallel_surrounding_for() {
func.func @parallel_surrounding_for() {
  %cf7 = arith.constant 7.0 : f32
  %m = memref.alloc() : memref<10x10xf32>
  affine.parallel (%i0) = (0) to (10) {
    affine.for %i1 = 0 to 10 {
      affine.store %cf7, %m[%i0,%i1] : memref<10x10xf32>
      %v0 = affine.load %m[%i0,%i1] : memref<10x10xf32>
      %v1 = arith.addf %v0, %v0 : f32
    }
  }
  memref.dealloc %m : memref<10x10xf32>
  return
// CHECK:       %[[C7:.*]] = arith.constant 7.000000e+00 : f32
// CHECK-NEXT:  affine.parallel (%{{.*}}) = (0) to (10) {
// CHECK-NEXT:    affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:      arith.addf %[[C7]], %[[C7]] : f32
// CHECK-NEXT:    }
// CHECK-NEXT:  }
// CHECK-NEXT:  return
}

// CHECK-LABEL: func.func @dead_affine_region_op
func.func @dead_affine_region_op() {
  %c1 = arith.constant 1 : index
  %alloc = memref.alloc() : memref<15xi1>
  %true = arith.constant true
  // One of these two identical stores is redundant and gets eliminated.
  affine.store %true, %alloc[%c1] : memref<15xi1>
  affine.store %true, %alloc[%c1] : memref<15xi1>
  // This affine.if is dead: its set is infeasible for these operands
  // (d0 * -8 >= 0 does not hold for d0 = 1).
  affine.if affine_set<(d0, d1, d2, d3) : ((d0 + 1) mod 8 >= 0, d0 * -8 >= 0)>(%c1, %c1, %c1, %c1) {
    // No forwarding will happen.
    affine.load %alloc[%c1] : memref<15xi1>
  }
  // CHECK-NEXT: arith.constant
  // CHECK-NEXT: memref.alloc
  // CHECK-NEXT: arith.constant
  // CHECK-NEXT: affine.store
  // CHECK-NEXT: affine.if
  // CHECK-NEXT:   affine.load
  return
}

// No scalar replacement is performed here since the pass does not rely on
// dominance info, which would be needed when the ops involved fall in
// different blocks of a CFG region.

// CHECK-LABEL: func @cross_block
func.func @cross_block() {
  %c10 = arith.constant 10 : index
  %alloc_83 = memref.alloc() : memref<1x13xf32>
  %alloc_99 = memref.alloc() : memref<13xi1>
  %true_110 = arith.constant true
  affine.store %true_110, %alloc_99[%c10] : memref<13xi1>
  %true = arith.constant true
  affine.store %true, %alloc_99[%c10] : memref<13xi1>
  cf.br ^bb1(%alloc_83 : memref<1x13xf32>)
^bb1(%35: memref<1x13xf32>):
  // CHECK: affine.load
  %69 = affine.load %alloc_99[%c10] : memref<13xi1>
  return
}

#map1 = affine_map<(d0) -> (d0)>

// CHECK-LABEL: func @consecutive_store
func.func @consecutive_store() {
  // CHECK: %[[CST:.*]] = arith.constant
  %tmp = arith.constant 1.1 : f16
  // CHECK: %[[ALLOC:.*]] = memref.alloc
  %alloc_66 = memref.alloc() : memref<f16, 1>
  affine.for %arg2 = 4 to 6 {
    affine.for %arg3 = #map1(%arg2) to #map1(%arg2) step 4 {
      // CHECK: affine.store %[[CST]], %[[ALLOC]][]
      affine.store %tmp, %alloc_66[] : memref<f16, 1>
      // CHECK-NOT: affine.store %[[CST]], %[[ALLOC]][]
      affine.store %tmp, %alloc_66[] : memref<f16, 1>
      %270 = affine.load %alloc_66[] : memref<f16, 1>
    }
  }
  return
}