// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-data-copy-generate="generate-dma fast-mem-space=2 skip-non-unit-stride-loops" -verify-diagnostics | FileCheck %s
// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-data-copy-generate="generate-dma fast-mem-capacity=16 fast-mem-space=2" | FileCheck %s --check-prefix FAST-MEM-16KB

// We run most test cases with the skip-non-unit-stride-loops option to allow
// testing DMA generation at inner levels easily - since the DMA generation
// would otherwise always generate DMAs at the outermost level (the default
// fast mem capacity is infinite). Using a specific capacity makes it harder to
// write a test case, as one would have to calculate total footprints. With
// skip-non-unit-stride-loops, non-unit-stride loops are always skipped, and
// their inner loops are traversed until a unit-stride loop is found (or the
// innermost block is reached).
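//
// As an illustrative sketch of that behavior (hypothetical, not a CHECK'd
// case), in a nest like
//
//   affine.for %i = 0 to 32 step 4 {  // non-unit stride: skipped
//     affine.for %j = 0 to 8 {        // unit stride: copies placed here
//       ...
//     }
//   }
//
// the pass walks past %i and generates the DMAs at the %j level.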

// -----

// CHECK-LABEL: func @loop_nest_1d() {
func.func @loop_nest_1d() {
  %A = memref.alloc() : memref<256 x f32>
  %B = memref.alloc() : memref<512 x f32>
  %F = memref.alloc() : memref<256 x f32, 2>
  // First DMA buffer.
  // CHECK:  memref.alloc() : memref<256xf32>
  // CHECK:  memref.alloc() : memref<256xf32, 2>
  // Tag for first DMA.
  // CHECK:  memref.alloc() : memref<1xi32>
  // First DMA transfer.
  // CHECK:  affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<256xf32>, memref<256xf32, 2>, memref<1xi32>
  // CHECK:  affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
  // Second DMA buffer.
  // CHECK:  memref.alloc() : memref<256xf32, 2>
  // Tag for second DMA.
  // CHECK:  memref.alloc() : memref<1xi32>
  // Second DMA transfer.
  // CHECK:       affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<512xf32>, memref<256xf32, 2>, memref<1xi32>
  // CHECK-NEXT:  affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
  // CHECK: affine.for %[[IV:.*]] = 0 to 256 {
      // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<256xf32, 2>
      // Buffer for '%{{.*}}' in faster memref space is of smaller size: 256xf32.
      // Affine map for load on B is composed and becomes identity.
      // CHECK:      affine.load %{{.*}}[%[[IV]]] : memref<256xf32, 2>
      // Already in faster memory space.
      // CHECK:     affine.load %{{.*}}[%[[IV]]] : memref<256xf32, 2>
  // CHECK-NEXT: }
  // CHECK-NEXT: dealloc %{{.*}} : memref<1xi32>
  // CHECK-NEXT: dealloc %{{.*}} : memref<256xf32, 2>
  // CHECK-NEXT: dealloc %{{.*}} : memref<1xi32>
  // CHECK-NEXT: dealloc %{{.*}} : memref<256xf32, 2>
  // CHECK-NEXT: return
  affine.for %i = 0 to 256 {
    affine.load %A[%i] : memref<256 x f32>
    %idx = affine.apply affine_map<(d0) -> (d0 + 256)>(%i)
    affine.load %B[%idx] : memref<512 x f32>
    affine.load %F[%i] : memref<256 x f32, 2>
  }
  return
}

// -----

// CHECK-LABEL: func @loop_nest_high_d
// CHECK:      %{{.*}} = arith.constant 16384 : index
// CHECK-DAG:  [[BUFB:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<512x32xf32, 2>
// CHECK-DAG:  [[BUFA:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<512x32xf32, 2>
// CHECK-DAG:  [[BUFC:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<512x32xf32, 2>
// CHECK-DAG:  [[TAGB:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1xi32>
// CHECK-DAG:  [[TAGA:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1xi32>
// CHECK-DAG:  [[TAGC:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1xi32>
// CHECK-DAG:  [[TAGC_W:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1xi32>
// INCOMING DMA for B.
// CHECK-DAG:  affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], [[BUFB]][%{{.*}}, %{{.*}}], [[TAGB]][%{{.*}}], %{{.*}} : memref<512x32xf32>, memref<512x32xf32, 2>, memref<1xi32>
// CHECK-DAG:  affine.dma_wait [[TAGB]][%{{.*}}], %{{.*}} : memref<1xi32>
// INCOMING DMA for A.
// CHECK-DAG:  affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], [[BUFA]][%{{.*}}, %{{.*}}], [[TAGA]][%{{.*}}], %{{.*}} : memref<512x32xf32>, memref<512x32xf32, 2>, memref<1xi32>
// CHECK-DAG:  affine.dma_wait [[TAGA]][%{{.*}}], %{{.*}} : memref<1xi32>
// INCOMING DMA for C.
// CHECK-DAG:  affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], [[BUFC]][%{{.*}}, %{{.*}}], [[TAGC]][%{{.*}}], %{{.*}} : memref<512x32xf32>, memref<512x32xf32, 2>, memref<1xi32>
// CHECK-DAG:  affine.dma_wait [[TAGC]][%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT:  affine.for %{{.*}} = 0 to 32 {
// CHECK-NEXT:    affine.for %{{.*}} = 0 to 32 {
// CHECK-NEXT:      affine.for %{{.*}} = 0 to 32 {
// CHECK-NEXT:        affine.for %{{.*}} = 0 to 16 {
// CHECK:               affine.load [[BUFB]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2>
// CHECK-NEXT:          "foo"(%{{.*}}) : (f32) -> ()
// CHECK-NEXT:        }
// CHECK-NEXT:        affine.for %{{.*}} = 0 to 16 {
// CHECK:               affine.load [[BUFA]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2>
// CHECK-NEXT:          "bar"(%{{.*}}) : (f32) -> ()
// CHECK-NEXT:        }
// CHECK-NEXT:        affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT:          "abc_compute"() : () -> f32
// CHECK:               affine.load [[BUFC]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2>
// CHECK-NEXT:          "addf32"(%{{.*}}, %{{.*}}) : (f32, f32) -> f32
// CHECK-NEXT:          affine.store %{{.*}}, [[BUFC]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2>
// CHECK-NEXT:        }
// CHECK-NEXT:        "foobar"() : () -> ()
// CHECK-NEXT:      }
// CHECK-NEXT:    }
// CHECK-NEXT:  }
// OUTGOING DMA for C.
// CHECK-NEXT:  affine.dma_start [[BUFC]][%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}], [[TAGC_W]][%{{.*}}], %{{.*}} : memref<512x32xf32, 2>, memref<512x32xf32>, memref<1xi32>
// CHECK-NEXT:  affine.dma_wait [[TAGC_W]][%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT:  dealloc [[TAGC_W]] : memref<1xi32>
// CHECK-NEXT:  dealloc [[TAGC]] : memref<1xi32>
// CHECK-NEXT:  dealloc [[BUFC]] : memref<512x32xf32, 2>
// CHECK-NEXT:  dealloc [[TAGA]] : memref<1xi32>
// CHECK-NEXT:  dealloc [[BUFA]] : memref<512x32xf32, 2>
// CHECK-NEXT:  dealloc [[TAGB]] : memref<1xi32>
// CHECK-NEXT:  dealloc [[BUFB]] : memref<512x32xf32, 2>
// CHECK-NEXT:  return
// CHECK-NEXT:}
func.func @loop_nest_high_d(%A: memref<512 x 32 x f32>,
    %B: memref<512 x 32 x f32>, %C: memref<512 x 32 x f32>) {
  // DMAs will be performed at this level (%jT is the first unit-stride loop).
  // A and B are read, while C is both read and written. A total of three new
  // buffers are allocated, and the existing loads/stores are replaced by
  // accesses to those buffers.
  affine.for %jT = 0 to 32 {
    affine.for %kT = 0 to 32 {
      affine.for %iT = 0 to 32 {
        affine.for %kk = 0 to 16 { // k intratile
          %k = affine.apply affine_map<(d0, d1) -> (16*d0 + d1)> (%kT, %kk)
          %v0 = affine.load %B[%k, %jT] : memref<512 x 32 x f32>
          "foo"(%v0) : (f32) -> ()
        }
        affine.for %ii = 0 to 16 { // i intratile.
          %i = affine.apply affine_map<(d0, d1) -> (16*d0 + d1)>(%iT, %ii)
          %v1 = affine.load %A[%i, %kT] : memref<512 x 32 x f32>
          "bar"(%v1) : (f32) -> ()
        }
        affine.for %ii_ = 0 to 16 { // i intratile.
          %v2 = "abc_compute"() : () -> f32
          %i_ = affine.apply affine_map<(d0, d1) -> (16*d0 + d1)>(%iT, %ii_)
          %v3 = affine.load %C[%i_, %jT] : memref<512 x 32 x f32>
          %v4 = "addf32"(%v2, %v3) : (f32, f32) -> (f32)
          affine.store %v4, %C[%i_, %jT] : memref<512 x 32 x f32>
        }
        "foobar"() : () -> ()
      }
    }
  }
  return
}

// -----

// A loop nest with a modulo 2 access. A strided DMA is not needed here: the
// accessed region is a 1x2 one within a 256 x 8 memref.
//
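// For each iteration of %i, the region accessed is %A[%i, %j mod 2] with %j
// ranging over [0, 8): a single row and the two columns {0, 1}, which is
// exactly what the 1x2 fast buffer checked below holds.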
// CHECK-LABEL: func @loop_nest_modulo() {
// CHECK:       memref.alloc() : memref<256x8xf32>
// CHECK-NEXT:    affine.for %{{.*}} = 0 to 32 step 4 {
// CHECK:           memref.alloc() : memref<1x2xf32, 2>
// CHECK-NEXT:      memref.alloc() : memref<1xi32>
// Composition of the affine map for '%{{.*}}' causes '%{{.*}}' to be added as a symbol.
// CHECK-NEXT:      affine.dma_start %{{.*}}[%{{.*}}, 0], %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<256x8xf32>, memref<1x2xf32, 2>, memref<1xi32>
// CHECK-NEXT:      affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT:      affine.for %{{.*}} = 0 to 8 {
//                    ...
//                    ...
// CHECK:           }
// CHECK-NEXT:      dealloc %{{.*}} : memref<1xi32>
// CHECK-NEXT:      dealloc %{{.*}} : memref<1x2xf32, 2>
// CHECK-NEXT:    }
// CHECK-NEXT:    return
func.func @loop_nest_modulo() {
  %A = memref.alloc() : memref<256 x 8 x f32>
  affine.for %i = 0 to 32 step 4 {
    // DMAs will be performed at this level (%j is the first unit stride loop)
    affine.for %j = 0 to 8 {
      %idx = affine.apply affine_map<(d0) -> (d0 mod 2)> (%j)
      // A buffer of size 1 x 2 will be allocated (the original memref is 256 x 8).
      %v = affine.load %A[%i, %idx] : memref<256 x 8 x f32>
    }
  }
  return
}

// -----

// DMA on tiled loop nest. This also tests the case where the bounds are
// dependent on outer loop IVs.
// CHECK-LABEL: func @loop_nest_tiled() -> memref<256x1024xf32> {
func.func @loop_nest_tiled() -> memref<256x1024xf32> {
  %0 = memref.alloc() : memref<256x1024xf32>
  affine.for %i0 = 0 to 256 step 32 {
    affine.for %i1 = 0 to 1024 step 32 {
// CHECK:      memref.alloc() : memref<32x32xf32, 2>
// CHECK-NEXT: memref.alloc() : memref<1xi32>
// Strided DMA here: 32 x 32 tile in a 256 x 1024 memref.
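// (Each of the 32 rows transfers 32 contiguous f32 elements, with consecutive
// rows 1024 elements apart in the original memref - hence the extra stride
// operands on the dma_start below.)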
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}, %{{.*}}, %{{.*}} : memref<256x1024xf32>, memref<32x32xf32, 2>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait
// CHECK-NEXT: affine.for %{{.*}} = #map
// CHECK-NEXT:   affine.for %{{.*}} = #map
      affine.for %i2 = affine_map<(d0) -> (d0)>(%i0) to affine_map<(d0) -> (d0 + 32)>(%i0) {
        affine.for %i3 = affine_map<(d0) -> (d0)>(%i1) to affine_map<(d0) -> (d0 + 32)>(%i1) {
          // CHECK: affine.load %{{.*}}[-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<32x32xf32, 2>
          %1 = affine.load %0[%i2, %i3] : memref<256x1024xf32>
        } // CHECK-NEXT: }
      }
    }
  }
  return %0 : memref<256x1024xf32>
}

// -----

// CHECK-LABEL: func @dma_constant_dim_access
func.func @dma_constant_dim_access(%A : memref<100x100xf32>) {
  %one = arith.constant 1 : index
  %N = arith.constant 100 : index
  // CHECK:      memref.alloc() : memref<1x100xf32, 2>
  // CHECK-NEXT: memref.alloc() : memref<1xi32>
  // No strided DMA needed here.
  // CHECK:      affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}  : memref<100x100xf32>, memref<1x100xf32, 2>,
  // CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
  affine.for %i = 0 to 100 {
    affine.for %j = 0 to affine_map<()[s0] -> (s0)> ()[%N] {
      // CHECK: affine.load %{{.*}}[0, %{{.*}}] : memref<1x100xf32, 2>
      affine.load %A[%one, %j] : memref<100 x 100 x f32>
    }
  }
  return
}

// -----

// CHECK-LABEL: func @dma_with_symbolic_accesses
func.func @dma_with_symbolic_accesses(%A : memref<100x100xf32>, %M : index) {
  %N = arith.constant 9 : index
  affine.for %i = 0 to 100 {
    affine.for %j = 0 to 100 {
      %idy = affine.apply affine_map<(d0, d1) [s0, s1] -> (d1 + s0 + s1)>(%i, %j)[%M, %N]
      affine.load %A[%i, %idy] : memref<100 x 100 x f32>
    }
  }
  return
// CHECK:       memref.alloc() : memref<100x100xf32, 2>
// CHECK-NEXT:  memref.alloc() : memref<1xi32>
// CHECK-NEXT:  affine.dma_start %{{.*}}[0, symbol(%{{.*}}) + 9], %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}
// CHECK-NEXT:  affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}}
// CHECK-NEXT:  affine.for %[[IV0:.*]] = 0 to 100 {
// CHECK-NEXT:    affine.for %[[IV1:.*]] = 0 to 100 {
// CHECK:           affine.load %{{.*}}[%[[IV0]], %[[IV1]]] : memref<100x100xf32, 2>
// CHECK-NEXT:    }
// CHECK-NEXT:  }
// CHECK:       return
}

// -----

// CHECK-LABEL: func @dma_with_symbolic_loop_bounds
func.func @dma_with_symbolic_loop_bounds(%A : memref<100x100xf32>, %M : index, %N: index) {
  %K = arith.constant 9 : index
// The buffer size can't be bounded by a constant smaller than the original
// memref size; so the DMA buffer is the entire 100x100.
// CHECK:       memref.alloc() : memref<100x100xf32, 2>
// CHECK-NEXT:  memref.alloc() : memref<1xi32>
// CHECK-NEXT:  affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<100x100xf32>, memref<100x100xf32, 2>, memref<1xi32>
// CHECK-NEXT:  affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
  affine.for %i = 0 to 100 {
    affine.for %j = %M to %N {
      %idy = affine.apply affine_map<(d1) [s0] -> (d1 + s0)>(%j)[%K]
      affine.load %A[%i, %idy] : memref<100 x 100 x f32>
    }
  }
  return
}

// -----

// CHECK-LABEL: func @dma_unknown_size
func.func @dma_unknown_size(%arg0: memref<?x?xf32>) {
  %c0 = arith.constant 0 : index
  %M = memref.dim %arg0, %c0 : memref<? x ? x f32>
  %N = memref.dim %arg0, %c0 : memref<? x ? x f32>
  affine.for %i = 0 to %M {
    affine.for %j = 0 to %N {
      // If this loop nest isn't tiled, the access requires a non-constant DMA
      // size -- not yet implemented.
      // CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<?x?xf32>
      affine.load %arg0[%i, %j] : memref<? x ? x f32>
    }
  }
  return
}

// -----

// CHECK-LABEL: func @dma_memref_3d
func.func @dma_memref_3d(%arg0: memref<1024x1024x1024xf32>) {
  affine.for %i = 0 to 1024 {
    affine.for %j = 0 to 1024 {
      affine.for %k = 0 to 1024 {
        %idx = affine.apply affine_map<(d0) -> (d0 mod 128)>(%i)
        %idy = affine.apply affine_map<(d0) -> (d0 mod 128)>(%j)
        %idz = affine.apply affine_map<(d0) -> (d0 mod 128)>(%k)
        // DMA with nested striding (or emulating it with a loop around a
        // strided DMA) is not yet implemented.
        // CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] : memref<1024x1024x1024xf32>
        %v = affine.load %arg0[%idx, %idy, %idz] : memref<1024 x 1024 x 1024 x f32>
      }
    }
  }
  return
}

// -----

// The first load accesses ([2,258), [128,384)).
// The second load accesses ([64,320), [2,258)).
// The first store writes to ([2,258), [192,448)).
// The second store writes to ([128,384), [2,258)).
// The union of all these regions is of size 382 x 446 and has its origin at
// (2, 2), i.e., the window ([2,384), [2,448)) in the original space.
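// (Per-dimension check: along dim 0, [2,258), [64,320), and [128,384) union
// to [2,384), i.e., 382 elements; along dim 1, [128,384), [2,258), and
// [192,448) union to [2,448), i.e., 446 elements - matching the 382x446
// buffer checked below.)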

// CHECK-LABEL: func @multi_load_store_union() {
func.func @multi_load_store_union() {
  %A = memref.alloc() : memref<512 x 512 x f32>
  affine.for %i = 0 to 256 {
    affine.for %j = 0 to 256 {
      %idx = affine.apply affine_map<(d0) -> (d0 + 64)>(%i)
      %idy = affine.apply affine_map<(d0) -> (d0 + 128)>(%j)
      %ishift = affine.apply affine_map<(d0) -> (d0 + 2)>(%i)
      %jshift = affine.apply affine_map<(d0) -> (d0 + 2)>(%j)

      %u = affine.load %A[%ishift, %idy] : memref<512 x 512 x f32>
      %v = affine.load %A[%idx, %jshift] : memref<512 x 512 x f32>

      %sidx = affine.apply affine_map<(d0) -> (d0 + 128)>(%i)
      %sidy = affine.apply affine_map<(d0) -> (d0 + 192)>(%j)

      affine.store %u, %A[%ishift, %sidy] : memref<512 x 512 x f32>
      affine.store %v, %A[%sidx, %jshift] : memref<512 x 512 x f32>
    }
  }
  return
}
// CHECK:       memref.alloc() : memref<512x512xf32>
// CHECK-NEXT:  memref.alloc() : memref<382x446xf32, 2>
// CHECK-NEXT:  memref.alloc() : memref<1xi32>
// CHECK-NEXT:  affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}, %{{.*}}, %{{.*}} : memref<512x512xf32>, memref<382x446xf32, 2>, memref<1xi32>
// CHECK-NEXT:  affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT:  memref.alloc() : memref<1xi32>
// CHECK-NEXT:  affine.for %{{.*}} = 0 to 256 {
// CHECK-NEXT:    affine.for %{{.*}} = 0 to 256 {
// CHECK:           affine.load %{{.*}}[%{{.*}}, %{{.*}} + 126] : memref<382x446xf32, 2>
// CHECK-NEXT:      affine.load %{{.*}}[%{{.*}} + 62, %{{.*}}] : memref<382x446xf32, 2>
// CHECK:           affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}} + 190] : memref<382x446xf32, 2>
// CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[%{{.*}} + 126, %{{.*}}] : memref<382x446xf32, 2>
// CHECK-NEXT:    }
// CHECK-NEXT:  }
// CHECK-NEXT:  affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}, %{{.*}}, %{{.*}} : memref<382x446xf32, 2>, memref<512x512xf32>, memref<1xi32>
// CHECK-NEXT:  affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT:  dealloc %{{.*}} : memref<1xi32>
// CHECK-NEXT:  dealloc %{{.*}} : memref<1xi32>
// CHECK-NEXT:  dealloc %{{.*}} : memref<382x446xf32, 2>
// CHECK-NEXT:  return
// CHECK-NEXT:}

// -----

// CHECK-LABEL: func @dma_loop_straightline_interspersed() {
func.func @dma_loop_straightline_interspersed() {
  %c0 = arith.constant 0 : index
  %c255 = arith.constant 255 : index
  %A = memref.alloc() : memref<256 x f32>
  %v = affine.load %A[%c0] : memref<256 x f32>
  affine.for %i = 1 to 255 {
    affine.load %A[%i] : memref<256 x f32>
  }
  %l = affine.load %A[%c255] : memref<256 x f32>
  affine.store %l, %A[%c0] : memref<256 x f32>
  return
}
// There are three regions here - the 'load' preceding the loop, the loop
// itself, and the operations appearing after the loop.
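// (Buffer sizes: the leading load touches only A[0], giving a 1xf32 buffer;
// the loop reads A[1..254], giving a 254xf32 buffer indexed with an offset of
// -1; the trailing load/store touch A[255] and A[0], whose bounding region
// spans all 256 elements, so a full 256xf32 buffer is copied in and written
// back out.)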
// CHECK:       memref.alloc() : memref<256xf32>
// CHECK-NEXT:  memref.alloc() : memref<1xf32, 2>
// CHECK-NEXT:  memref.alloc() : memref<1xi32>
// CHECK-NEXT:  affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<256xf32>, memref<1xf32, 2>, memref<1xi32>
// CHECK-NEXT:  affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT:  affine.load %{{.*}}[0] : memref<1xf32, 2>
// CHECK-NEXT:  dealloc %{{.*}} : memref<1xi32>
// CHECK-NEXT:  dealloc %{{.*}} : memref<1xf32, 2>
// CHECK-NEXT:  memref.alloc() : memref<254xf32, 2>
// CHECK-NEXT:  memref.alloc() : memref<1xi32>
// CHECK-NEXT:  affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<256xf32>, memref<254xf32, 2>, memref<1xi32>
// CHECK-NEXT:  affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT:  affine.for %{{.*}} = 1 to 255 {
// CHECK-NEXT:    affine.load %{{.*}}[%{{.*}} - 1] : memref<254xf32, 2>
// CHECK-NEXT:  }
// CHECK-NEXT:  dealloc %{{.*}} : memref<1xi32>
// CHECK-NEXT:  dealloc %{{.*}} : memref<254xf32, 2>
// CHECK-NEXT:  memref.alloc() : memref<256xf32, 2>
// CHECK-NEXT:  memref.alloc() : memref<1xi32>
// CHECK-NEXT:  affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<256xf32>, memref<256xf32, 2>, memref<1xi32>
// CHECK-NEXT:  affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT:  memref.alloc() : memref<1xi32>
// CHECK-NEXT:  affine.load %{{.*}}[255] : memref<256xf32, 2>
// CHECK-NEXT:  affine.store %{{.*}}, %{{.*}}[0] : memref<256xf32, 2>
// CHECK-NEXT:  affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<256xf32, 2>, memref<256xf32>, memref<1xi32>
// CHECK-NEXT:  affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT:  dealloc %{{.*}} : memref<1xi32>
// CHECK-NEXT:  dealloc %{{.*}} : memref<1xi32>
// CHECK-NEXT:  dealloc %{{.*}} : memref<256xf32, 2>
// CHECK-NEXT:  return

// -----

// CHECK-LABEL: func @dma_mixed_loop_blocks() {
func.func @dma_mixed_loop_blocks() {
  %c0 = arith.constant 0 : index
  %A = memref.alloc() : memref<256 x 256 x vector<8 x f32>>
  affine.for %i = 0 to 256 {
    %v = affine.load %A[%c0, %c0] : memref<256 x 256 x vector<8 x f32>>
    "foo"(%v) : (vector<8 x f32>) -> ()
    affine.for %j = 0 to 256 {
      %w = affine.load %A[%i, %j] : memref<256 x 256 x vector<8 x f32>>
      "bar"(%w) : (vector<8 x f32>) -> ()
    }
  }
  return
}
// CHECK-DAG:   [[MEM:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<256x256xvector<8xf32>>
// CHECK-DAG:   [[BUF:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<256x256xvector<8xf32>, 2>
// CHECK-DAG:   [[TAG:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1xi32>
// CHECK:       affine.dma_start [[MEM]][%{{.*}}, %{{.*}}], [[BUF]][%{{.*}}, %{{.*}}], [[TAG]][%{{.*}}], %{{.*}} : memref<256x256xvector<8xf32>>, memref<256x256xvector<8xf32>, 2>, memref<1xi32>
// CHECK-NEXT:  affine.dma_wait [[TAG]][%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT:  affine.for %{{.*}} = 0 to 256 {
// CHECK:         affine.load [[BUF]][0, 0] : memref<256x256xvector<8xf32>, 2>
// CHECK:         affine.for %{{.*}} = 0 to 256 {
// CHECK-NEXT:      affine.load [[BUF]][%{{.*}}, %{{.*}}] : memref<256x256xvector<8xf32>, 2>

// -----

// CHECK-LABEL: func @relative_loop_bounds
func.func @relative_loop_bounds(%arg0: memref<1027xf32>) {
  affine.for %i0 = 0 to 1024 {
    affine.for %i2 = affine_map<(d0) -> (d0)>(%i0) to affine_map<(d0) -> (d0 + 4)>(%i0) {
      %0 = arith.constant 0.0 : f32
      affine.store %0, %arg0[%i2] : memref<1027xf32>
    }
  }
  return
}
// CHECK:      [[BUF:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1027xf32, 2>
// CHECK-NEXT: [[MEM:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1xi32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 1024 {
// CHECK-NEXT:    affine.for %[[I2:.*]] = {{#map[0-9a-zA-Z_]*}}(%{{.*}}) to {{#map[0-9a-zA-Z_]*}}(%{{.*}}) {
// CHECK:           affine.store %{{.*}}, [[BUF]][%[[I2]]] : memref<1027xf32, 2>
// CHECK-NEXT:    }
// CHECK-NEXT:  }
// CHECK-NEXT:  affine.dma_start [[BUF]][%{{.*}}], %{{.*}}[%{{.*}}], [[MEM]][%{{.*}}], %{{.*}}  : memref<1027xf32, 2>, memref<1027xf32>, memref<1xi32>
// CHECK-NEXT:  affine.dma_wait [[MEM]][%{{.*}}], %{{.*}} : memref<1xi32>

// -----

func.func @test_read_write_region_union() {
  %0 = memref.alloc() : memref<256xf32>
  affine.for %i0 = 0 to 10 {
    // memref dims:  [0, 256)
    // read region:  [100, 110)
    // write region: [25, 35)
    // union region: [25, 110)
    %a0 = affine.apply affine_map<(d0) -> (d0 + 100)>(%i0)
    %a1 = affine.apply affine_map<(d0) -> (d0 + 25)>(%i0)
    %1 = affine.load %0[%a0] : memref<256xf32>
    affine.store %1, %0[%a1] : memref<256xf32>
  }
  return
}

// CHECK:       memref.alloc() : memref<256xf32>
// CHECK-NEXT:  memref.alloc() : memref<85xf32, 2>
// CHECK-NEXT:  memref.alloc() : memref<1xi32>
// CHECK-NEXT:  affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<256xf32>, memref<85xf32, 2>, memref<1xi32>
// CHECK-NEXT:  affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT:  memref.alloc() : memref<1xi32>
// CHECK-NEXT:  affine.for %{{.*}} = 0 to 10 {
// CHECK:         affine.load %{{.*}}[%{{.*}} + 75] : memref<85xf32, 2>
// CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<85xf32, 2>
// CHECK-NEXT:  }
// CHECK-NEXT:  affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<85xf32, 2>, memref<256xf32>, memref<1xi32>
// CHECK-NEXT:  affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>

// -----

// This should create a buffer of size 2 for %arg2.
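// (For any iteration of %i8, %i9 floordiv 8 with %i9 in [%i8, %i8 + 3) takes
// at most two consecutive values - {0, 1} over the full range [0, 9) - so a
// 2-element fast buffer suffices.)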

#map_lb = affine_map<(d0) -> (d0)>
#map_ub = affine_map<(d0) -> (d0 + 3)>
#map_acc = affine_map<(d0) -> (d0 floordiv 8)>
// CHECK-LABEL: func @test_analysis_util
func.func @test_analysis_util(%arg0: memref<4x4x16x1xf32>, %arg1: memref<144x9xf32>, %arg2: memref<2xf32>) -> (memref<144x9xf32>, memref<2xf32>) {
  %c0 = arith.constant 0 : index
  %0 = memref.alloc() : memref<64x1xf32>
  %1 = memref.alloc() : memref<144x4xf32>
  %2 = arith.constant 0.0 : f32
  affine.for %i8 = 0 to 9 step 3 {
    affine.for %i9 = #map_lb(%i8) to #map_ub(%i8) {
      affine.for %i17 = 0 to 64 {
        %23 = affine.apply #map_acc(%i9)
        %25 = affine.load %arg2[%23] : memref<2xf32>
        %26 = affine.apply #map_lb(%i17)
        %27 = affine.load %0[%26, %c0] : memref<64x1xf32>
        affine.store %27, %arg2[%23] : memref<2xf32>
      }
    }
  }
  return %arg1, %arg2 : memref<144x9xf32>, memref<2xf32>
}
// CHECK:       affine.for %{{.*}} = 0 to 9 step 3 {
// CHECK:         [[BUF:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2xf32, 2>
// CHECK:         affine.dma_start %{{.*}}[%{{.*}} floordiv 8], [[BUF]]
// CHECK:         affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK:         affine.for %{{.*}} =

// -----

#map3 = affine_map<(d0) -> (d0)>
#map12 = affine_map<(d0) -> (d0 + 3)>
#map14 = affine_map<(d0, d1) -> ((d0 + d1 * 72) floordiv 2304 + ((((d0 + d1 * 72) mod 2304) mod 1152) mod 9) floordiv 3)>
#map15 = affine_map<(d0, d1) -> ((d0 + d1 * 72) mod 2304 - (((d0 + d1 * 72) mod 2304) floordiv 1152) * 1151 - ((((d0 + d1 * 72) mod 2304) mod 1152) floordiv 9) * 9 - (((((d0 + d1 * 72) mod 2304) mod 1152) mod 9) floordiv 3) * 3)>
#map16 = affine_map<(d0, d1) -> (((((d0 + d1 * 72) mod 2304) mod 1152) floordiv 9) floordiv 8)>
// Test for the test case in b/128303048 #4.
// CHECK-LABEL: func @test_memref_bounds
func.func @test_memref_bounds(%arg0: memref<4x4x16x1xvector<8x128xf32>>, %arg1: memref<144x9xvector<8x128xf32>>, %arg2: memref<2xvector<8x128xf32>>) -> (memref<144x9xvector<8x128xf32>>, memref<2xvector<8x128xf32>>) {
  %c0 = arith.constant 0 : index
  affine.for %i8 = 0 to 9 step 3 {
    affine.for %i9 = #map3(%i8) to #map12(%i8) {
      affine.for %i10 = 0 to 64 {
        %10 = affine.apply #map14(%i9, %i10)
        %11 = affine.apply #map15(%i9, %i10)
        %12 = affine.apply #map16(%i9, %i10)
        %13 = affine.load %arg0[%10, %11, %12, %c0] : memref<4x4x16x1xvector<8x128xf32>>
      }
    }
  }
  return %arg1, %arg2 : memref<144x9xvector<8x128xf32>>, memref<2xvector<8x128xf32>>
}

// CHECK:       memref.alloc() : memref<4x4x16x1xvector<8x128xf32>, 2>
// CHECK-NEXT:  memref.alloc() : memref<1xi32>
// CHECK-NEXT:  affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<4x4x16x1xvector<8x128xf32>>, memref<4x4x16x1xvector<8x128xf32>, 2>, memref<1xi32>
// CHECK-NEXT:  affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>

// -----

// Since the fast memory capacity is 16 KB, DMA generation will happen right
// under %i0.
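// (The tile accessed under %i0 is 4 x 1024 x 4 bytes = 16 KB, which just fits
// that capacity; hoisting the copy above %i0 would require the full
// 256 x 1024 x 4 bytes = 1 MB memref.)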

// FAST-MEM-16KB-LABEL: func @load_store_same_memref
func.func @load_store_same_memref(%arg0: memref<256x1024xf32>) {
  // FAST-MEM-16KB:  affine.for %{{.*}} = 0 to 256 step 4
  affine.for %i0 = 0 to 256 step 4 {
    // FAST-MEM-16KB: [[BUF:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<4x1024xf32, 2>
    // FAST-MEM-16KB:    affine.dma_start %{{.*}}
    // FAST-MEM-16KB-NEXT: affine.dma_wait
    // FAST-MEM-16KB:  affine.for %{{.*}}
    affine.for %i1 = 0 to 1024 step 4 {
      // FAST-MEM-16KB:  affine.for %{{.*}}
      affine.for %i2 = affine_map<(d0) -> (d0)>(%i0) to affine_map<(d0) -> (d0 + 4)>(%i0) {
        // FAST-MEM-16KB:  affine.for %{{.*}}
        affine.for %i3 = affine_map<(d0) -> (d0)>(%i1) to affine_map<(d0) -> (d0 + 4)>(%i1) {
          %3 = affine.load %arg0[%i2, %i3] : memref<256x1024xf32>
          %4 = arith.mulf %3, %3 : f32
          affine.store %4, %arg0[%i2, %i3] : memref<256x1024xf32>
        } // FAST-MEM-16KB: }
      } // FAST-MEM-16KB: }
    } // FAST-MEM-16KB: }
    // FAST-MEM-16KB:    affine.dma_start [[BUF]]
    // FAST-MEM-16KB-NEXT: affine.dma_wait
  }
  return
}

// -----

// This is a 3-d loop nest tiled by 4 x 4 x 4. Under %i, %j, %k, the size of a
// tile of %arg0, %arg1, and %arg2 accessed is 4 KB (each), i.e., 12 KB in
// total. With the fast mem capacity set to 16 KB, the DMAs, if placed under
// %k, will fit. However, the region of %arg2 accessed is invariant w.r.t. the
// %k loop, unlike those of %arg0 and %arg1. So, its DMA can be hoisted one
// level up and placed under %j, while the DMAs for %arg0 and %arg1 appear
// right under the %k loop.
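// (Footprint arithmetic: a 4x4 tile of vector<64xf32> is 16 * 256 bytes =
// 4 KB per memref, 12 KB for all three; hoisting the %arg0 and %arg1 copies
// above %k would widen their tiles to 4x8, i.e., 8 KB each, exceeding the
// 16 KB capacity.)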

#map0 = affine_map<(d0) -> (d0)>
#map1 = affine_map<(d0) -> (d0 + 4)>
// FAST-MEM-16KB-LABEL: func @simple_matmul
func.func @simple_matmul(%arg0: memref<8x8xvector<64xf32>>, %arg1: memref<8x8xvector<64xf32>>, %arg2: memref<8x8xvector<64xf32>>) -> memref<8x8xvector<64xf32>> {
  affine.for %i = 0 to 8 step 4 {
    affine.for %j = 0 to 8 step 4 {
      affine.for %k = 0 to 8 step 4 {
        affine.for %ii = #map0(%i) to #map1(%i) {
          affine.for %jj = #map0(%j) to #map1(%j) {
            affine.for %kk = #map0(%k) to #map1(%k) {
              %5 = affine.load %arg0[%ii, %kk] : memref<8x8xvector<64xf32>>
              %6 = affine.load %arg1[%kk, %jj] : memref<8x8xvector<64xf32>>
              %7 = affine.load %arg2[%ii, %jj] : memref<8x8xvector<64xf32>>
              %8 = arith.mulf %5, %6 : vector<64xf32>
              %9 = arith.addf %7, %8 : vector<64xf32>
              affine.store %9, %arg2[%ii, %jj] : memref<8x8xvector<64xf32>>
            }
          }
        }
      }
    }
  }
  return %arg2 : memref<8x8xvector<64xf32>>
}
// FAST-MEM-16KB: affine.for %{{.*}} = 0 to 8 step 4 {
// FAST-MEM-16KB:   affine.for %{{.*}} = 0 to 8 step 4 {
// FAST-MEM-16KB:     affine.dma_start %{{.*}}
// FAST-MEM-16KB:     affine.dma_wait
// FAST-MEM-16KB:     affine.for %{{.*}} = 0 to 8 step 4 {
// FAST-MEM-16KB:       affine.dma_start %{{.*}}
// FAST-MEM-16KB:       affine.dma_wait
// FAST-MEM-16KB:       affine.dma_start %{{.*}}
// FAST-MEM-16KB:       affine.dma_wait
// FAST-MEM-16KB:       affine.for %{{.*}} = #map{{[0-9a-zA-Z_]*}}(%{{.*}}) to #map{{[0-9a-zA-Z_]*}}(%{{.*}}) {
// FAST-MEM-16KB-NEXT:    affine.for %{{.*}} = #map{{[0-9a-zA-Z_]*}}(%{{.*}}) to #map{{[0-9a-zA-Z_]*}}(%{{.*}}) {
// FAST-MEM-16KB-NEXT:      affine.for %{{.*}} = #map{{[0-9a-zA-Z_]*}}(%{{.*}}) to #map{{[0-9a-zA-Z_]*}}(%{{.*}}) {
// FAST-MEM-16KB:           }
// FAST-MEM-16KB:         }
// FAST-MEM-16KB:       }
// FAST-MEM-16KB:     }
// FAST-MEM-16KB:     affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}
// FAST-MEM-16KB:     affine.dma_wait
626