xref: /llvm-project/mlir/test/Dialect/Affine/pipeline-data-transfer.mlir (revision b36de52c98523e37c65b9d8a4424dbe9e6ea5c8d)
1// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-pipeline-data-transfer | FileCheck %s
2
3// -----
4
// Test case: one DMA transfer (dma_start + dma_wait on a single tag) at the
// top of an 8-iteration loop, followed by a load/compute/store on the DMA
// destination buffer and an inner loop. The expectations after the function
// require the destination buffer and the tag to gain a leading dimension of
// size 2 that is indexed with `mod 2`, one dma_start to appear before the
// loop, the loop to run from 1 to 8, and one more wait/compute sequence to
// follow the loop.
5// CHECK-DAG: [[$MOD_2:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 mod 2)>
6// CHECK-DAG: [[$MAP_MINUS_1:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 - 1)>
7
8// CHECK-LABEL: func @loop_nest_dma() {
9func.func @loop_nest_dma() {
10
  // %A lives in memory space 0, %Ah in memory space 1; both use an identity
  // layout map.
11  %A = memref.alloc() : memref<256 x f32, affine_map<(d0) -> (d0)>, 0>
12  %Ah = memref.alloc() : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
13
14  %tag = memref.alloc() : memref<1 x f32>
15
16  %zero = arith.constant 0 : index
17  %num_elts = arith.constant 32 : index
18
19  affine.for %i = 0 to 8 {
    // Transfer from %A[%i] to %Ah[%i], tracked through %tag[%zero], and the
    // matching wait on the same tag element.
20    affine.dma_start %A[%i], %Ah[%i], %tag[%zero], %num_elts : memref<256 x f32>, memref<32 x f32, 1>, memref<1 x f32>
21    affine.dma_wait %tag[%zero], %num_elts : memref<1 x f32>
    // Indexed (dereferencing) uses of the DMA destination buffer.
22    %v = affine.load %Ah[%i] : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
23    %r = "compute"(%v) : (f32) -> (f32)
24    affine.store %r, %Ah[%i] : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
25    affine.for %j = 0 to 32 {
26      "do_more_compute"(%i, %j) : (index, index) -> ()
27    }
28  }
29  memref.dealloc %tag : memref<1 x f32>
30  memref.dealloc %Ah : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
31  return
32}
// Expected before the loop: the original %A alloc, a 2x32 buffer, a 2x1 tag,
// and one dma_start.
33// CHECK:       %{{.*}} = memref.alloc() : memref<256xf32>
34// CHECK:       %{{.*}} = memref.alloc() : memref<2x32xf32, 1>
35// CHECK-NEXT:  %{{.*}} = memref.alloc() : memref<2x1xf32>
36// CHECK-NEXT:  affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}} mod 2, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
// Expected loop (1 to 8): a dma_start, then affine.apply ops using the
// (d0 - 1) and (d0 mod 2) maps, then the wait and the original compute body.
37// CHECK-NEXT:  affine.for %{{.*}} = 1 to 8 {
38// CHECK-NEXT:    affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}} mod 2, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
39// CHECK-NEXT:    affine.apply [[$MAP_MINUS_1]](%{{.*}})
40// CHECK-NEXT:    affine.apply [[$MOD_2]](%{{.*}})
41// CHECK-NEXT:    affine.apply [[$MOD_2]](%{{.*}})
42// CHECK-NEXT:    affine.dma_wait %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<2x1xf32>
43// CHECK-NEXT:    affine.load %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1>
44// CHECK-NEXT:    "compute"(%{{.*}}) : (f32) -> f32
45// CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1>
46// CHECK-NEXT:    affine.for %{{.*}} = 0 to 32 {
47// CHECK-NEXT:      "do_more_compute"(%{{.*}}, %{{.*}}) : (index, index) -> ()
48// CHECK-NEXT:    }
49// CHECK-NEXT:  }
// Expected after the loop: the same apply/wait/compute sequence once more,
// without a dma_start, followed by the deallocs of the 2x1 tag and the 2x32
// buffer.
50// CHECK-NEXT:  affine.apply [[$MAP_MINUS_1]](%{{.*}})
51// CHECK-NEXT:  affine.apply [[$MOD_2]](%{{.*}})
52// CHECK-NEXT:  affine.apply [[$MOD_2]](%{{.*}})
53// CHECK-NEXT:  affine.dma_wait %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<2x1xf32>
54// CHECK-NEXT:  affine.load %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1>
55// CHECK-NEXT:  "compute"(%{{.*}}) : (f32) -> f32
56// CHECK-NEXT:  affine.store %{{.*}}, %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1>
57// CHECK-NEXT:  affine.for %{{.*}} = 0 to 32 {
58// CHECK-NEXT:    "do_more_compute"(%{{.*}}, %{{.*}}) : (index, index) -> ()
59// CHECK-NEXT:  }
60// CHECK-NEXT:  memref.dealloc %{{.*}} : memref<2x1xf32>
61// CHECK-NEXT:  memref.dealloc %{{.*}} : memref<2x32xf32, 1>
62// CHECK-NEXT:  return
63// CHECK-NEXT:}
64
65// -----
66
// Test case: DMA pipelining on a loop with a non-unit step (0 to 512 step 4)
// whose DMA buffer and tag are allocated and freed inside the loop body.
// The expectations require a 2x4 buffer and a 2x1 tag allocated ahead of the
// loop and indexed with `(iv floordiv 4) mod 2`, one dma_start ahead of the
// loop, the loop to run from 4 to 512 step 4 with the wait side shifted by
// (d0 - 4), and a final wait/compute plus both deallocs after the loop.
67// CHECK-DAG: [[$FLOOR_MOD_2:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> ((d0 floordiv 4) mod 2)>
68// CHECK-DAG: [[$REMAP_SHIFT_MINUS_4:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 - 4)>
69
70// CHECK-LABEL: @loop_step
// %arg1 is not referenced by any op in the body.
71func.func @loop_step(%arg0: memref<512xf32>,
72                  %arg1: memref<512xf32>) {
73  %c0 = arith.constant 0 : index
74  %c4 = arith.constant 4 : index
75  affine.for %i0 = 0 to 512 step 4 {
76    %1 = memref.alloc() : memref<4xf32, 1>
77    %2 = memref.alloc() : memref<1xi32>
    // Transfer of %c4 elements from %arg0[%i0] into %1, tracked through
    // %2[%c0]; no stride operands are given.
78    affine.dma_start %arg0[%i0], %1[%c0], %2[%c0], %c4
79              : memref<512xf32>, memref<4xf32, 1>, memref<1xi32>
80    affine.dma_wait %2[%c0], %c4 : memref<1xi32>
81    "compute"(%i0) : (index) -> ()
82    memref.dealloc %2 : memref<1xi32>
83    memref.dealloc %1 : memref<4xf32, 1>
84  }
85  return
86}
// Expected before the loop: the 2x4 buffer, the 2x1 tag and one dma_start.
87// CHECK:        [[BUF:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x4xf32, 1>
88// CHECK:        [[TAG:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x1xi32>
89// CHECK-NEXT:   affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[(%{{.*}} floordiv 4) mod 2, 0], [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32>
// Expected loop: a dma_start, the (d0 - 4) and ((d0 floordiv 4) mod 2)
// applies, then the wait on the captured tag and the compute op.
90// CHECK-NEXT:   affine.for %{{.*}} = 4 to 512 step 4 {
91// CHECK-NEXT:     affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[(%{{.*}} floordiv 4) mod 2, 0], [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32>
92// CHECK-NEXT:     affine.apply [[$REMAP_SHIFT_MINUS_4]](%{{.*}})
93// CHECK-NEXT:     affine.apply [[$FLOOR_MOD_2]](%{{.*}})
94// CHECK:          affine.dma_wait [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<2x1xi32>
95// CHECK-NEXT:     "compute"(%{{.*}}) : (index) -> ()
96// CHECK-NEXT:   }
// Expected after the loop: one more shifted wait/compute, then the deallocs
// of the captured tag and buffer.
97// CHECK-NEXT:   [[SHIFTED:%[0-9a-zA-Z_]+]] = affine.apply [[$REMAP_SHIFT_MINUS_4]](%{{.*}})
98// CHECK-NEXT:   %{{.*}} = affine.apply [[$FLOOR_MOD_2]]([[SHIFTED]])
99// CHECK:        affine.dma_wait [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<2x1xi32>
100// CHECK-NEXT:   "compute"(%{{.*}}) : (index) -> ()
101// CHECK-NEXT:   memref.dealloc [[TAG]] : memref<2x1xi32>
102// CHECK-NEXT:   memref.dealloc [[BUF]] : memref<2x4xf32, 1>
103// CHECK-NEXT:   return
104// CHECK-NEXT: }
105
106// -----
107
// Test case: a two-level loop nest with DMAs at both levels. The outer loop
// transfers from %arg2; the inner loop transfers from %arg0 and %arg1. The
// expectations are interleaved with the input below: they name the doubled
// buffer/tag of the outer DMA BUF_ARG2/TAG_ARG2 and those of the inner DMAs
// BUF_ARG0/BUF_ARG1/TAG_ARG0/TAG_ARG1, and they expect the pipelined inner
// nest a second time (the *_NESTED captures) after the final wait on
// TAG_ARG2 that follows the outer loop.
108#map1 = affine_map<(d0, d1) -> ((d0 * 2048 + d1 * 256) floordiv 32)>
109#map2 = affine_map<(d0) -> ((d0 * 2048) floordiv 32)>
110// CHECK-LABEL: func @loop_dma_nested(%{{.*}}: memref<512x32xvector<8xf32>
111func.func @loop_dma_nested(%arg0: memref<512x32xvector<8xf32>>, %arg1: memref<512x32xvector<8xf32>>, %arg2: memref<512x32xvector<8xf32>>) {
112  %num_elts = arith.constant 256 : index
113  %c0 = arith.constant 0 : index
  // %0, %1, %2 are the DMA destination buffers for %arg0, %arg1, %arg2;
  // %3, %4, %5 are the corresponding tags.
114  %0 = memref.alloc() : memref<64x4xvector<8xf32>, 2>
115  %1 = memref.alloc() : memref<64x4xvector<8xf32>, 2>
116  %2 = memref.alloc() : memref<64x4xvector<8xf32>, 2>
117  %3 = memref.alloc() : memref<2xi32>
118  %4 = memref.alloc() : memref<2xi32>
119  %5 = memref.alloc() : memref<2xi32>
120  // Prologue for DMA overlap on arg2.
121  // CHECK-DAG: [[BUF_ARG2:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x64x4xvector<8xf32>, 2>
122  // CHECK-DAG: [[TAG_ARG2:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x2xi32>
123  // CHECK: affine.dma_start %{{.*}}[
124  // CHECK: affine.for %{{.*}} = 1 to 8 {
125  affine.for %i0 = 0 to 8 {
126    %6 = affine.apply #map2(%i0)
127    affine.dma_start %arg2[%6, %c0], %2[%c0, %c0], %5[%c0], %num_elts : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
128    affine.dma_wait %5[%c0], %num_elts : memref<2xi32>
129    // Steady state for DMA overlap on arg2
130    // CHECK: affine.dma_start %{{.*}}[
131    // CHECK: affine.dma_wait [[TAG_ARG2]]
132    // Prologue for DMA overlap on arg0, arg1 nested within i0
133    // CHECK: [[BUF_ARG0:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x64x4xvector<8xf32>, 2>
134    // CHECK: [[BUF_ARG1:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x64x4xvector<8xf32>, 2>
135    // CHECK: [[TAG_ARG0:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x2xi32>
136    // CHECK: [[TAG_ARG1:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x2xi32>
137    // CHECK: affine.dma_start %{{.*}}[
138    // CHECK: affine.dma_start %{{.*}}[
139    // CHECK-NEXT: affine.for %{{.*}} = 1 to 8 {
140    affine.for %i1 = 0 to 8 {
141      %7 = affine.apply #map1(%i0, %i1)
142      %8 = affine.apply #map2(%i1)
143      affine.dma_start %arg0[%7, %c0], %0[%c0, %c0], %3[%c0], %num_elts : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
144      affine.dma_start %arg1[%8, %c0], %1[%c0, %c0], %4[%c0], %num_elts : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
145      affine.dma_wait %3[%c0], %num_elts : memref<2xi32>
146      affine.dma_wait %4[%c0], %num_elts : memref<2xi32>
147      // Steady state for DMA overlap on arg0, arg1
148      // CHECK: affine.dma_start %{{.*}}[
149      // CHECK: affine.dma_start %{{.*}}[
150      // CHECK: affine.dma_wait [[TAG_ARG0]]
151      // CHECK: affine.dma_wait [[TAG_ARG1]]
152      // CHECK-NEXT: affine.for %{{.*}} = 0 to 4 {
153      affine.for %i2 = 0 to 4 {
154        "foo"() : () -> ()
155      }
156    }
157    // epilogue for arg0, arg1
158    // CHECK: affine.dma_wait [[TAG_ARG0]]
159    // CHECK: affine.dma_wait [[TAG_ARG1]]
160    // CHECK-DAG:    memref.dealloc [[TAG_ARG1]] : memref<2x2xi32>
161    // CHECK-DAG:    memref.dealloc [[TAG_ARG0]] : memref<2x2xi32>
162    // CHECK-DAG:    memref.dealloc [[BUF_ARG1]] : memref<2x64x4xvector<8xf32>, 2>
163    // CHECK-DAG:    memref.dealloc [[BUF_ARG0]] : memref<2x64x4xvector<8xf32>, 2>
164  // epilogue for DMA overlap on %arg2
165  // CHECK:  affine.dma_wait [[TAG_ARG2]]
166  // Within the epilogue for arg2's DMA, we have the DMAs on %arg0, %arg1 nested.
167  // CHECK: [[BUF_ARG0_NESTED:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x64x4xvector<8xf32>, 2>
168  // CHECK: [[BUF_ARG1_NESTED:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x64x4xvector<8xf32>, 2>
169  // CHECK: [[TAG_ARG0_NESTED:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x2xi32>
170  // CHECK: [[TAG_ARG1_NESTED:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<2x2xi32>
171  // CHECK:  affine.dma_start %{{.*}}[
172  // CHECK:  affine.dma_start %{{.*}}[
173  // CHECK:  affine.for %{{.*}} = 1 to 8 {
174  // CHECK:    affine.dma_start %{{.*}}[
175  // CHECK:    affine.dma_start %{{.*}}[
176  // CHECK:    affine.dma_wait [[TAG_ARG0_NESTED]]
177  // CHECK:    affine.dma_wait [[TAG_ARG1_NESTED]]
178  // CHECK:    affine.for %{{.*}} = 0 to 4 {
179  // CHECK:      "foo"() : () -> ()
180  // CHECK:  affine.dma_wait [[TAG_ARG0_NESTED]]
181  // CHECK:  affine.dma_wait [[TAG_ARG1_NESTED]]
182  // CHECK:  affine.for %{{.*}} = 0 to 4 {
183  }
184  memref.dealloc %5 : memref<2xi32>
185  memref.dealloc %4 : memref<2xi32>
186  memref.dealloc %3 : memref<2xi32>
187  memref.dealloc %2 : memref<64x4xvector<8xf32>, 2>
188  memref.dealloc %1 : memref<64x4xvector<8xf32>, 2>
189  memref.dealloc %0 : memref<64x4xvector<8xf32>, 2>
190  return
// Expected tail: deallocs of the nested buffers/tags and of the outer
// buffer/tag in any order, immediately followed by the return.
191// CHECK: }
192// CHECK-DAG:  memref.dealloc [[TAG_ARG1_NESTED]] : memref<2x2xi32>
193// CHECK-DAG:  memref.dealloc [[TAG_ARG0_NESTED]] : memref<2x2xi32>
194// CHECK-DAG:  memref.dealloc [[BUF_ARG1_NESTED]] : memref<2x64x4xvector<8xf32>, 2>
195// CHECK-DAG:  memref.dealloc [[BUF_ARG0_NESTED]] : memref<2x64x4xvector<8xf32>, 2>
196// CHECK-DAG:  memref.dealloc [[TAG_ARG2]] : memref<2x2xi32>
197// CHECK-DAG:  memref.dealloc [[BUF_ARG2]] : memref<2x64x4xvector<8xf32>, 2>
198// CHECK-NEXT: return
199}
200
201// -----
// Negative test case: the loop body performs a transfer from %arg2 into the
// buffer %2 and then a transfer from %2 back to %arg2, both tracked through
// the same tag %5. The expectations require that no dma_start appears ahead
// of the loop and that the loop keeps its original bounds 0 to 8.
202#map2 = affine_map<(d0) -> ((d0 * 2048) floordiv 32)>
203
204// CHECK-LABEL: func @loop_dma_dependent
205func.func @loop_dma_dependent(%arg2: memref<512x32xvector<8xf32>>) {
206  %num_elts = arith.constant 256 : index
207  %c0 = arith.constant 0 : index
  // Only %2 and %5 are used inside the loop; %0, %1, %3 and %4 are allocated
  // and deallocated without any other use.
208  %0 = memref.alloc() : memref<64x4xvector<8xf32>, 2>
209  %1 = memref.alloc() : memref<64x4xvector<8xf32>, 2>
210  %2 = memref.alloc() : memref<64x4xvector<8xf32>, 2>
211  %3 = memref.alloc() : memref<2xi32>
212  %4 = memref.alloc() : memref<2xi32>
213  %5 = memref.alloc() : memref<2xi32>
214
215  // The two DMAs below are dependent (incoming and outgoing on the same
216  // memref) in the same iteration; so no pipelining here.
217  // CHECK-NOT: affine.dma_start
218  // CHECK: affine.for %{{.*}} = 0 to 8 {
219  affine.for %i0 = 0 to 8 {
220    %6 = affine.apply #map2(%i0)
    // Incoming: %arg2[%6, %c0] -> %2.
221    affine.dma_start %arg2[%6, %c0], %2[%c0, %c0], %5[%c0], %num_elts : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
222    affine.dma_wait %5[%c0], %num_elts : memref<2xi32>
223
    // Outgoing: %2 -> %arg2[%6, %c0], reusing the same tag element.
224    affine.dma_start %2[%c0, %c0], %arg2[%6, %c0], %5[%c0], %num_elts : memref<64x4xvector<8xf32>, 2>, memref<512x32xvector<8xf32>>, memref<2xi32>
225    affine.dma_wait %5[%c0], %num_elts : memref<2xi32>
226  }
227  memref.dealloc %5 : memref<2xi32>
228  memref.dealloc %4 : memref<2xi32>
229  memref.dealloc %3 : memref<2xi32>
230  memref.dealloc %2 : memref<64x4xvector<8xf32>, 2>
231  memref.dealloc %1 : memref<64x4xvector<8xf32>, 2>
232  memref.dealloc %0 : memref<64x4xvector<8xf32>, 2>
233  return
234}
235
236// -----
237
// Negative test case: the DMA destination buffer %Av is passed as a whole
// memref operand to the op "foo" inside the loop. The expectations require
// that no dma_start appears ahead of the loop, that the loop keeps its
// bounds 0 to 16, and that "foo" still receives a memref<32x32xf32, 2>.
238// CHECK-LABEL: func @escaping_use
239func.func @escaping_use(%arg0: memref<512 x 32 x f32>) {
  // %c32 is not referenced by any op below.
240  %c32 = arith.constant 32 : index
241  %num_elt = arith.constant 512 : index
242  %zero = arith.constant 0 : index
243  %Av = memref.alloc() : memref<32 x 32 x f32, 2>
244  %tag = memref.alloc() : memref<1 x i32>
245
246  // CHECK-NOT: affine.dma_start
247  // CHECK: affine.for %{{.*}} = 0 to 16 {
248  affine.for %kTT = 0 to 16 {
249    affine.dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %tag[%zero], %num_elt :
250      memref<512 x 32 x f32>,
251      memref<32 x 32 x f32, 2>, memref<1 x i32>
252    affine.dma_wait %tag[%zero], %num_elt : memref<1 x i32>
253    // Escaping use of the buffer %Av; no DMA pipelining / double buffering will be done.
254    "foo"(%Av) : (memref<32 x 32 x f32, 2>) -> ()
255  }
256  memref.dealloc %tag : memref<1 x i32>
257  memref.dealloc %Av : memref<32 x 32 x f32, 2>
258  return
259// CHECK:        "foo"(%{{[0-9a-zA-Z_]+}}) : (memref<32x32xf32, 2>) -> ()
260// CHECK:      }
261// CHECK:      return
262}
263
264// -----
265
// Negative test case: same shape as @escaping_use, except that here the tag
// memref %tag (not the buffer) is passed as a whole memref operand to "foo".
// The expectations require that no dma_start appears ahead of the loop, that
// the loop keeps its bounds 0 to 16, and that "foo" still receives a
// memref<1xi32>.
266// CHECK-LABEL: func @escaping_tag
267func.func @escaping_tag(%arg0: memref<512 x 32 x f32>) {
  // %c32 is not referenced by any op below.
268  %c32 = arith.constant 32 : index
269  %num_elt = arith.constant 512 : index
270  %zero = arith.constant 0 : index
271  %Av = memref.alloc() : memref<32 x 32 x f32, 2>
272  %tag = memref.alloc() : memref<1 x i32>
273
274  // CHECK-NOT: affine.dma_start
275  // CHECK: affine.for %{{.*}} = 0 to 16 {
276  affine.for %kTT = 0 to 16 {
277    affine.dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %tag[%zero], %num_elt :
278      memref<512 x 32 x f32>,
279      memref<32 x 32 x f32, 2>, memref<1 x i32>
280    affine.dma_wait %tag[%zero], %num_elt : memref<1 x i32>
281    // Escaping use of the tag memref %tag; no DMA pipelining / double buffering will be done.
282    "foo"(%tag) : (memref<1 x i32>) -> ()
283  }
284  memref.dealloc %tag : memref<1 x i32>
285  memref.dealloc %Av : memref<32 x 32 x f32, 2>
286  return
287// CHECK:        "foo"(%{{[0-9a-zA-Z_]+}}) : (memref<1xi32>) -> ()
288// CHECK:      }
289// CHECK:      return
290}
291
292
293// -----
294
// Negative test case: the DMA destination buffer %Av is read by an
// affine.load placed after the loop, and the loaded value is returned. The
// expectations require that no dma_start appears ahead of the loop, that the
// loop keeps its bounds 0 to 16, and that the load after the loop still
// operates on a memref<32x32xf32, 2>.
295// CHECK-LABEL: func @live_out_use
296func.func @live_out_use(%arg0: memref<512 x 32 x f32>) -> f32 {
  // %c32 is not referenced by any op below.
297  %c32 = arith.constant 32 : index
298  %num_elt = arith.constant 512 : index
299  %zero = arith.constant 0 : index
300  %Av = memref.alloc() : memref<32 x 32 x f32, 2>
301  %tag = memref.alloc() : memref<1 x i32>
302
303  // CHECK-NOT: affine.dma_start
304  // CHECK: affine.for %{{.*}} = 0 to 16 {
305  affine.for %kTT = 0 to 16 {
306    affine.dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %tag[%zero], %num_elt :
307      memref<512 x 32 x f32>,
308      memref<32 x 32 x f32, 2>, memref<1 x i32>
309    affine.dma_wait %tag[%zero], %num_elt : memref<1 x i32>
310  }
311  // Use of %Av that is live out of the 'affine.for' op; no DMA pipelining will be done.
312  %v = affine.load %Av[%zero, %zero] : memref<32 x 32 x f32, 2>
313  memref.dealloc %tag : memref<1 x i32>
314  memref.dealloc %Av : memref<32 x 32 x f32, 2>
315  return %v : f32
316// CHECK:      affine.load %{{[0-9a-zA-Z_]+}}[%{{.*}}, %{{.*}}] : memref<32x32xf32, 2>
317// CHECK:      return
318}
319
320// -----
321
// Test case: the DMA destination buffer %Av is a function argument of
// dynamic shape memref<?x?xf32, 2>. The expectations require two memref.dim
// queries on it, an alloc of memref<2x?x?xf32, 2> taking two operands, one
// dma_start ahead of the loop, the loop to run from 1 to 16 with a wait on a
// memref<2x1xi32> tag, and one more such wait after the loop.
322// CHECK-LABEL: func @dynamic_shape_dma_buffer
323func.func @dynamic_shape_dma_buffer(%arg0: memref<512 x 32 x f32>, %Av: memref<? x ? x f32, 2>) {
324  %num_elt = arith.constant 512 : index
325  %zero = arith.constant 0 : index
326  %tag = memref.alloc() : memref<1 x i32>
327
328// Double buffering for dynamic shaped buffer.
329// Note: Cannot capture C0 because there are multiple C0 constants in the IR.
330// CHECK:       memref.dim %{{.*}}, %{{.*}} : memref<?x?xf32, 2>
331// CHECK-NEXT:  %[[C1:.*]] = arith.constant 1 : index
332// CHECK-NEXT:  memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32, 2>
333// CHECK-NEXT:  memref.alloc(%{{.*}}, %{{.*}}) : memref<2x?x?xf32, 2>
334// CHECK:       affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0, 0], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}}
335  affine.for %kTT = 0 to 16 {
    // The loop body contains only the transfer and its wait; %Av and %tag
    // have no other uses in this function.
336    affine.dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %tag[%zero], %num_elt :
337      memref<512 x 32 x f32>,
338      memref<? x ? x f32, 2>, memref<1 x i32>
339    affine.dma_wait %tag[%zero], %num_elt : memref<1 x i32>
340  }
341  return
342// CHECK-NEXT:  affine.for %{{.*}} = 1 to 16 {
343// CHECK:         affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0, 0], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}}
344// CHECK:         affine.dma_wait %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<2x1xi32>
345// CHECK:       }
346// CHECK:       affine.dma_wait %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<2x1xi32>
347// CHECK:       return
348}
349
350// Memref replacement will fail here due to a non-dereferencing use. However,
351// no incorrect transformation is performed in spite of one of the uses being a
352// dereferencing one since replaceAllMemRefUsesWith checks for escaping uses
353// before performing any replacement.
// This function is not preceded by a split marker, so it is processed in the
// same input chunk as @dynamic_shape_dma_buffer.
354// CHECK-LABEL: func @escaping_and_indexed_use_mix
355func.func @escaping_and_indexed_use_mix() {
356  %A = memref.alloc() : memref<256 x f32, affine_map<(d0) -> (d0)>, 0>
357  %Ah = memref.alloc() : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
358  %tag = memref.alloc() : memref<1 x f32>
359  %zero = arith.constant 0 : index
360  %num_elts = arith.constant 32 : index
361
362  // alloc for the buffer is created but no replacement should happen.
363  affine.for %i = 0 to 8 {
364    affine.dma_start %A[%i], %Ah[%i], %tag[%zero], %num_elts : memref<256 x f32>, memref<32 x f32, 1>, memref<1 x f32>
365    affine.dma_wait %tag[%zero], %num_elts : memref<1 x f32>
    // Non-dereferencing (escaping) use: %Ah is passed as a whole memref.
366    "compute"(%Ah) : (memref<32 x f32, 1>) -> ()
    // Dereferencing (indexed) use of the same memref.
367    %v = affine.load %Ah[%i] : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
368    "foo"(%v) : (f32) -> ()
369  }
  // Only %A and %Ah are deallocated here; %tag has no dealloc in this function.
370  memref.dealloc %A : memref<256 x f32, affine_map<(d0) -> (d0)>, 0>
371  memref.dealloc %Ah : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
372  return
373}
374// No replacement.
// The expected loop keeps its bounds 0 to 8 and its original body order, with
// the buffer type still memref<32xf32, 1> and the tag type memref<1xf32>.
375// CHECK: affine.for %{{.*}} = 0 to 8 {
376// CHECK-NEXT:   affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}
377// CHECK-NEXT:   affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xf32>
378// CHECK-NEXT:   "compute"(%{{.*}}) : (memref<32xf32, 1>) -> ()
379// CHECK-NEXT:   [[VAL:%[0-9a-zA-Z_]+]] = affine.load %{{.*}}[%{{.*}}] : memref<32xf32, 1>
380// CHECK-NEXT:   "foo"([[VAL]]) : (f32) -> ()
381