// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only" -split-input-file | FileCheck %s --check-prefixes=CHECK,PARALLEL-CHECK
// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only check-parallel-regions=false" -split-input-file | FileCheck %s --check-prefixes=CHECK,NO-PARALLEL-CHECK

// Run fuzzer with different seeds.
// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23" -split-input-file -o /dev/null
// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=59" -split-input-file -o /dev/null
// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=91" -split-input-file -o /dev/null
// A loop that only forwards its iter_arg: whether the loop operand is inplace
// follows the writability of the init value (%A is read-only, %B is writable).
// CHECK-LABEL: func @scf_for_yield_only
func.func @scf_for_yield_only(
    %A : tensor<?xf32> {bufferization.writable = false},
    %B : tensor<?xf32> {bufferization.writable = true},
    %lb : index,
    %ub : index,
    %step : index)
  -> (tensor<?xf32>, tensor<?xf32>)
{
  //      CHECK: scf.for
  // CHECK-NEXT: scf.yield
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  //      CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "false"]}
  %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
    scf.yield %t : tensor<?xf32>
  }

  //      CHECK: scf.for
  // CHECK-NEXT: scf.yield
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  //      CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]}
  %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %B) -> (tensor<?xf32>) {
    scf.yield %t : tensor<?xf32>
  }

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [-1, 1]
  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
}

// -----
40
// CHECK-LABEL: func @scf_for_with_tensor.insert_slice
func.func @scf_for_with_tensor.insert_slice(
    %A : tensor<?xf32> {bufferization.writable = false},
    %B : tensor<?xf32> {bufferization.writable = true},
    %C : tensor<4xf32> {bufferization.writable = false},
    %lb : index,
    %ub : index,
    %step : index)
  -> (tensor<?xf32>, tensor<?xf32>)
{
  //      CHECK: scf.for
  // scf.for bbArgs are always inplaceable seen from ops inside the body:
  //   1. Either the matching tensor is not inplaceable and an alloc occurs
  //      which makes bbArg inplaceable.
  //   2. Or it is already inplaceable and so is bbArg.
  // CHECK-NEXT:   tensor.insert_slice
  // CHECK-SAME:     {__inplace_operands_attr__ = ["true", "true"]}
  // CHECK-NEXT:   tensor.insert_slice
  // CHECK-SAME:     {__inplace_operands_attr__ = ["true", "true"]}
  // CHECK-NEXT:   scf.yield {__inplace_operands_attr__ = ["true", "true"]}
  // CHECK-NEXT: } {__inplace_operands_attr__ = ["none", "none", "none", "false", "true"]}
  %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
      -> (tensor<?xf32>, tensor<?xf32>)
  {
    %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
    %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
    scf.yield %ttA, %ttB : tensor<?xf32>, tensor<?xf32>
  }

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [-1, 1]
  return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
}

// -----
76
func.func private @some_use(tensor<?xf32>) -> ()

// CHECK-LABEL: func @scf_for_deps
func.func @scf_for_deps(
    %A : tensor<?xf32> {bufferization.writable = true},
    %B : tensor<?xf32> {bufferization.writable = true},
    %lb : index,
    %ub : index,
    %step : index)
  -> (tensor<?xf32>)
{
  // %r0 must be out of place because one use of %t in the subsequent production
  // of %r1 is read.
  //      CHECK: scf.for
  // CHECK-NEXT: call
  // CHECK-SAME: {__inplace_operands_attr__ = ["false"]}
  // CHECK-NEXT: scf.yield
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  //      CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "false"]}
  %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
    func.call @some_use(%t) : (tensor<?xf32>) -> ()
    scf.yield %t : tensor<?xf32>
  }

  // %r1 bufferizes inplace fine.
  //      CHECK: scf.for
  // CHECK-NEXT: call
  // CHECK-SAME: {__inplace_operands_attr__ = ["false"]}
  // CHECK-NEXT: scf.yield
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  //      CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]}
  %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
    func.call @some_use(%t) : (tensor<?xf32>) -> ()
    scf.yield %t : tensor<?xf32>
  }

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0]
  return %r1: tensor<?xf32>
}

// -----
119
#accesses = [
  affine_map<(i) -> (i)>
]
#trait = {
  indexing_maps = #accesses,
  iterator_types = ["parallel"]
}

// CHECK-LABEL: func @reading_scf_for
func.func @reading_scf_for(%t1: tensor<?xf32> {bufferization.writable = true},
                           %s: index, %v: vector<5xf32>) -> (tensor<?xf32>, vector<5xf32>) {

  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %cst = arith.constant 0.0 : f32

  // Write to %t1.
  // CHECK:      vector.transfer_write
  // CHECK-SAME: __inplace_operands_attr__ = ["none", "false", "none"]
  %t3 = vector.transfer_write %v, %t1[%s] : vector<5xf32>, tensor<?xf32>

  // Read the old value of %t1 inside the loop via an alias.
  // CHECK: scf.for {{.*}} {
  %r, %v3 = scf.for %i = %c0 to %s step %c1 iter_args(%t2 = %t1, %v0 = %v) -> (tensor<?xf32>, vector<5xf32>) {
    // CHECK:      tensor.extract_slice
    // CHECK-SAME: __inplace_operands_attr__ = ["true", "none", "none"]
    %e = tensor.extract_slice %t2[%s][%s][1] : tensor<?xf32> to tensor<?xf32>

    // Read from %t1 via alias %e.
    %v2 = vector.transfer_read %e[%s], %cst : tensor<?xf32>, vector<5xf32>
    scf.yield %t2, %v2 : tensor<?xf32>, vector<5xf32>
  }
  // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true", "none"]}

  // Use %t3 in some way without reading it, so that it does not get DCE'd.
  // CHECK:      linalg.generic
  // CHECK-SAME: __inplace_operands_attr__ = ["true"]
  %o = linalg.generic #trait outs (%t3 : tensor<?xf32>) {
      ^bb(%0: f32) :
        linalg.yield %cst : f32
    } -> (tensor<?xf32>)

  return %o, %v3 : tensor<?xf32>, vector<5xf32>
}

// -----
166
#accesses = [
  affine_map<(i) -> (i)>
]
#trait = {
  indexing_maps = #accesses,
  iterator_types = ["parallel"]
}

// CHECK-LABEL: func @non_reading_scf_for
func.func @non_reading_scf_for(%t1: tensor<?xf32> {bufferization.writable = true},
                               %s: index, %v: vector<5xf32>) -> (tensor<?xf32>, vector<5xf32>) {

  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c10 = arith.constant 10 : index
  %cst = arith.constant 0.0 : f32

  // Write to %t1.
  // CHECK:      vector.transfer_write
  // CHECK-SAME: __inplace_operands_attr__ = ["none", "true", "none"]
  %t3 = vector.transfer_write %v, %t1[%s] : vector<5xf32>, tensor<?xf32>

  // This loop does not read from %t1. It only writes to it.
  // CHECK:      scf.for
  %r, %v3 = scf.for %i = %c0 to %c10 step %c1 iter_args(%t2 = %t1, %v0 = %v) -> (tensor<?xf32>, vector<5xf32>) {
    // Write to %t1 via %t2. (Overwrite %t3.)
    // CHECK:      linalg.generic
    // CHECK-SAME: __inplace_operands_attr__ = ["true"]
    %o2 = linalg.generic #trait outs (%t2 : tensor<?xf32>) {
        ^bb(%0: f32) :
          linalg.yield %cst : f32
      } -> (tensor<?xf32>)

    // Read overwritten value. This is not a read of %t1.
    %v2 = vector.transfer_read %o2[%s], %cst : tensor<?xf32>, vector<5xf32>
    scf.yield %o2, %v2 : tensor<?xf32>, vector<5xf32>
  }

  // Use %t3 in some way without reading it, so that it does not get DCE'd.
  // CHECK:      linalg.generic
  // CHECK-SAME: __inplace_operands_attr__ = ["true"]
  %o = linalg.generic #trait outs (%t3 : tensor<?xf32>) {
      ^bb(%0: f32) :
        linalg.yield %cst : f32
    } -> (tensor<?xf32>)

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0, -1]
  return %o, %v3 : tensor<?xf32>, vector<5xf32>
}

// -----
219
//===----------------------------------------------------------------------===//
// scf.if cases
//===----------------------------------------------------------------------===//
223
// This example passes analysis, but it fails when bufferizing.
// CHECK-LABEL: func @scf_if_inplace1
func.func @scf_if_inplace1(%t1: tensor<?xf32> {bufferization.writable = true},
                           %t2: tensor<?xf32> {bufferization.writable = true},
                           %cond: i1) -> tensor<?xf32> {
  %r = scf.if %cond -> (tensor<?xf32>) {
    // CHECK:      scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t1 : tensor<?xf32>
  } else {
    // CHECK:      scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t2 : tensor<?xf32>
  }
  return %r : tensor<?xf32>
}

// -----
242
// One branch yields %t1 directly, the other writes into %t1; both yields and
// the write can be inplace.
// CHECK-LABEL: func @scf_if_inplace2
func.func @scf_if_inplace2(%t1: tensor<?xf32> {bufferization.writable = true},
                           %v: vector<5xf32>, %idx: index,
                           %cond: i1) -> tensor<?xf32> {
  %r = scf.if %cond -> (tensor<?xf32>) {
    // CHECK:      scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t1 : tensor<?xf32>
  } else {
    //      CHECK: vector.transfer_write
    // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]
    %t2 = vector.transfer_write %v, %t1[%idx] : vector<5xf32>, tensor<?xf32>
    scf.yield %t2 : tensor<?xf32>
  }
  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0]
  return %r : tensor<?xf32>
}

// -----
263
// CHECK-LABEL: func @scf_if_inplace3
func.func @scf_if_inplace3(%t1: tensor<?xf32> {bufferization.writable = true},
                           %v1: vector<5xf32>, %v2: vector<5xf32>, %idx: index,
                           %cond: i1) -> tensor<?xf32> {
  //      CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"]
  %e = tensor.extract_slice %t1[%idx][%idx][1] : tensor<?xf32> to tensor<?xf32>
  %r = scf.if %cond -> (tensor<?xf32>) {
    //      CHECK: vector.transfer_write
    // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]
    %t2 = vector.transfer_write %v1, %e[%idx] : vector<5xf32>, tensor<?xf32>
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t2 : tensor<?xf32>
  } else {
    // Writing the same tensor through an alias. This is OK.
    //      CHECK: vector.transfer_write
    // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]
    %t3 = vector.transfer_write %v2, %t1[%idx] : vector<5xf32>, tensor<?xf32>
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t3 : tensor<?xf32>
  }
  return %r : tensor<?xf32>
}

// -----
291
// CHECK-LABEL: func @scf_if_in_place4
func.func @scf_if_in_place4(%t1: tensor<?xf32> {bufferization.writable = true},
                            %v: vector<5xf32>, %idx: index,
                            %cond: i1, %cond2: i1) -> (tensor<?xf32>, vector<10xf32>) {
  %cst = arith.constant 0.0 : f32
  %r = scf.if %cond -> (tensor<?xf32>) {
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t1 : tensor<?xf32>
  } else {
    //      CHECK: vector.transfer_write
    // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]
    %t2 = vector.transfer_write %v, %t1[%idx] : vector<5xf32>, tensor<?xf32>
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t2 : tensor<?xf32>
  }
  %r_alias = scf.if %cond2 -> (tensor<?xf32>) {
    // Reading %r is OK. No conflict.
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %r : tensor<?xf32>
  } else {
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %r : tensor<?xf32>
  }
  %v2 = vector.transfer_read %r_alias[%idx], %cst : tensor<?xf32>, vector<10xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0, -1]
  return %r_alias, %v2 : tensor<?xf32>, vector<10xf32>
}

// -----
327
// CHECK-LABEL: func @scf_if_inplace5
func.func @scf_if_inplace5(%t1: tensor<?xf32> {bufferization.writable = true},
                           %idx: index, %cond: i1) -> tensor<?xf32> {
  %r = scf.if %cond -> (tensor<?xf32>) {
    //      CHECK: tensor.extract_slice
    // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"]
    %e = tensor.extract_slice %t1[%idx][%idx][1] : tensor<?xf32> to tensor<?xf32>
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %e : tensor<?xf32>
  } else {
    //      CHECK: tensor.extract_slice
    // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"]
    %f = tensor.extract_slice %t1[%idx][%idx][1] : tensor<?xf32> to tensor<?xf32>
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %f : tensor<?xf32>
  }

  // Inserting into an equivalent tensor at the same offset. This bufferizes
  // inplace.
  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"]
  %r2 = tensor.insert_slice %r into %t1[%idx][%idx][1] : tensor<?xf32> into tensor<?xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0]
  return %r2 : tensor<?xf32>
}

// -----
359
// CHECK-LABEL: func @scf_if_inplace6
func.func @scf_if_inplace6(%t1: tensor<?xf32> {bufferization.writable = true},
                           %v1: vector<5xf32>, %v2: vector<5xf32>,
                           %v3: vector<5xf32>, %idx: index,
                           %cond: i1, %cond2: i1) -> tensor<?xf32> {
  // Test nested scf.if ops.
  %r = scf.if %cond -> (tensor<?xf32>) {
    %t2 = scf.if %cond2 -> (tensor<?xf32>) {
      //      CHECK: vector.transfer_write
      // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]
      %t3 = vector.transfer_write %v1, %t1[%idx] : vector<5xf32>, tensor<?xf32>
      //      CHECK: scf.yield
      // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
      scf.yield %t3 : tensor<?xf32>
    } else {
      //      CHECK: vector.transfer_write
      // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]
      %t4 = vector.transfer_write %v3, %t1[%idx] : vector<5xf32>, tensor<?xf32>
      //      CHECK: scf.yield
      // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
      scf.yield %t4 : tensor<?xf32>
    }
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t2 : tensor<?xf32>
  } else {
    //      CHECK: vector.transfer_write
    // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]
    %t3 = vector.transfer_write %v2, %t1[%idx] : vector<5xf32>, tensor<?xf32>
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t3 : tensor<?xf32>
  }

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0]
  return %r : tensor<?xf32>
}

// -----
400
// CHECK-LABEL: func @scf_if_inplace7
func.func @scf_if_inplace7(%t1: tensor<?xf32> {bufferization.writable = true},
                           %v1: vector<5xf32>, %v2: vector<5xf32>, %idx: index,
                           %idx2: index, %cond: i1) -> (tensor<?xf32>, vector<5xf32>) {
  %cst = arith.constant 0.0 : f32
  %r, %v_r2 = scf.if %cond -> (tensor<?xf32>, vector<5xf32>) {
    //      CHECK: vector.transfer_write
    // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]
    %t2 = vector.transfer_write %v1, %t1[%idx] : vector<5xf32>, tensor<?xf32>
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none"]}
    scf.yield %t2, %v1 : tensor<?xf32>, vector<5xf32>
  } else {
    // Writing the same tensor through an alias.
    //      CHECK: vector.transfer_write
    // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false", "none"]
    %t3 = vector.transfer_write %v2, %t1[%idx] : vector<5xf32>, tensor<?xf32>
    // Read the original value of %t1. This requires the write in this branch
    // to be out-of-place. But the write in the other branch can still be
    // inplace.
    %v_r = vector.transfer_read %t1[%idx2], %cst : tensor<?xf32>, vector<5xf32>
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none"]}
    scf.yield %t3, %v_r : tensor<?xf32>, vector<5xf32>
  }
  return %r, %v_r2 : tensor<?xf32>, vector<5xf32>
}

// -----
430
// CHECK-LABEL: func @scf_if_out_of_place1a
func.func @scf_if_out_of_place1a(%t1: tensor<?xf32> {bufferization.writable = true},
                                 %idx: index, %idx2: index,
                                 %cond: i1) -> tensor<?xf32> {
  %r = scf.if %cond -> (tensor<?xf32>) {
    //      CHECK: tensor.extract_slice
    // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"]
    %e = tensor.extract_slice %t1[%idx][%idx][1] : tensor<?xf32> to tensor<?xf32>
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %e : tensor<?xf32>
  } else {
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t1 : tensor<?xf32>
  }

  // Reading from and writing to the same tensor via different args. This is a
  // conflict.
  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "false", "none", "none"]
  %r2 = tensor.insert_slice %r into %t1[%idx2][%idx2][1] : tensor<?xf32> into tensor<?xf32>
  return %r2 : tensor<?xf32>
}

// -----
457
// CHECK-LABEL: func @scf_if_out_of_place1b
func.func @scf_if_out_of_place1b(%t1: tensor<?xf32> {bufferization.writable = true},
                                 %idx: index, %idx2: index, %idx3: index,
                                 %cond: i1) -> tensor<?xf32> {
  %r = scf.if %cond -> (tensor<?xf32>) {
    //      CHECK: tensor.extract_slice
    // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none", "none"]
    %e = tensor.extract_slice %t1[%idx][%idx][1] : tensor<?xf32> to tensor<?xf32>
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %e : tensor<?xf32>
  } else {
    //      CHECK: tensor.extract_slice
    // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none", "none"]
    %f = tensor.extract_slice %t1[%idx2][%idx2][1] : tensor<?xf32> to tensor<?xf32>
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %f : tensor<?xf32>
  }

  // Reading from and writing to the same tensor via different args. This is a
  // conflict. In contrast to scf_if_out_of_place1a, the fact that %r aliases
  // with %t1 is only detected when analyzing the tensor.extract_slices. That's
  // why the tensor.insert_slice is inplace and the two extract_slices are
  // out-of-place.
  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"]
  %r2 = tensor.insert_slice %r into %t1[%idx3][%idx3][1] : tensor<?xf32> into tensor<?xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0]
  return %r2 : tensor<?xf32>
}

// -----
493
// CHECK-LABEL: func @scf_if_out_of_place1c
func.func @scf_if_out_of_place1c(%t1: tensor<?xf32> {bufferization.writable = true},
                                 %idx: index, %idx2: index, %cond: i1) -> tensor<?xf32> {
  %r = scf.if %cond -> (tensor<?xf32>) {
    //      CHECK: tensor.extract_slice
    // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none", "none"]
    %e = tensor.extract_slice %t1[%idx][%idx][1] : tensor<?xf32> to tensor<?xf32>
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %e : tensor<?xf32>
  } else {
    // TODO: This one could bufferize inplace, but the analysis is too restrictive.
    //      CHECK: tensor.extract_slice
    // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none", "none"]
    %f = tensor.extract_slice %t1[%idx2][%idx2][1] : tensor<?xf32> to tensor<?xf32>
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %f : tensor<?xf32>
  }

  //      CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"]
  %r2 = tensor.insert_slice %r into %t1[%idx2][%idx2][1] : tensor<?xf32> into tensor<?xf32>

  //      CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0]
  return %r2 : tensor<?xf32>
}

// -----
524
// CHECK-LABEL: func @scf_if_out_of_place2
func.func @scf_if_out_of_place2(%t1: tensor<?xf32> {bufferization.writable = true},
                                %v: vector<5xf32>, %idx: index,
                                %cond: i1) -> (tensor<?xf32>, vector<10xf32>) {
  %cst = arith.constant 0.0 : f32
  %r = scf.if %cond -> (tensor<?xf32>) {
    scf.yield %t1 : tensor<?xf32>
  } else {
    //      CHECK: vector.transfer_write
    // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false", "none"]
    %t2 = vector.transfer_write %v, %t1[%idx] : vector<5xf32>, tensor<?xf32>
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t2 : tensor<?xf32>
  }

  // Read the old value of %t1. Forces the transfer_write to bufferize
  // out-of-place.
  %v2 = vector.transfer_read %t1[%idx], %cst : tensor<?xf32>, vector<10xf32>
  return %r, %v2 : tensor<?xf32>, vector<10xf32>
}

// -----
548
// CHECK-LABEL: func @scf_if_out_of_place3
func.func @scf_if_out_of_place3(%t1: tensor<?xf32> {bufferization.writable = true},
                                %v: vector<5xf32>, %idx: index,
                                %cond: i1, %cond2: i1) -> (tensor<?xf32>, vector<10xf32>) {
  %cst = arith.constant 0.0 : f32
  %r = scf.if %cond -> (tensor<?xf32>) {
    scf.yield %t1 : tensor<?xf32>
  } else {
    //      CHECK: vector.transfer_write
    // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false", "none"]
    %t2 = vector.transfer_write %v, %t1[%idx] : vector<5xf32>, tensor<?xf32>
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t2 : tensor<?xf32>
  }
  %t1_alias = scf.if %cond2 -> (tensor<?xf32>) {
    // scf.yield bufferizes to a read. That is a conflict in this example.
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t1 : tensor<?xf32>
  } else {
    //      CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t1 : tensor<?xf32>
  }
  %v2 = vector.transfer_read %t1_alias[%idx], %cst : tensor<?xf32>, vector<10xf32>
  return %r, %v2 : tensor<?xf32>, vector<10xf32>
}

// -----
579
// CHECK-LABEL: func @write_to_same_tensor_in_loop_in_place(
func.func @write_to_same_tensor_in_loop_in_place(
    %A : tensor<?xf32> {bufferization.writable = true},
    %lb : index, %ub : index, %step : index, %sz: index)
  -> (tensor<?xf32>)
{
  // CHECK: scf.for {{.*}} {
  %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
    %B = bufferization.alloc_tensor(%sz) : tensor<?xf32>
    %i2 = arith.index_cast %i : index to i32
    %i3 = arith.sitofp %i2 : i32 to f32
    // The tensor.insert is in-place because the %B is defined inside the loop.
    //      CHECK: tensor.insert
    // CHECK-SAME:   {__inplace_operands_attr__ = ["none", "true", "none"]}
    %B2 = tensor.insert %i3 into %B[%i] : tensor<?xf32>
    //      CHECK: tensor.insert_slice
    // CHECK-SAME:   {__inplace_operands_attr__ = ["true", "true", "none", "none"]}
    %A2 = tensor.insert_slice %B2 into %t[%i][%sz][1] : tensor<?xf32> into tensor<?xf32>
    scf.yield %A2 : tensor<?xf32>
  }
  // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]}

  return %r0 : tensor<?xf32>
}

// -----
606
// This is a regression test. Everything can bufferize in-place because %7 and
// %arg1 are in the same repetitive region.

// CHECK-LABEL: func @same_enclosing_repetitive_region
func.func @same_enclosing_repetitive_region(%2: tensor<320xf32>,
                                            %3: tensor<320x10240xf32>)
  -> tensor<320xf32>
{
  %c0 = arith.constant 0 : index
  %cst = arith.constant -0.000000e+00 : f32
  %c320 = arith.constant 320 : index
  %4 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %2) -> (tensor<320xf32>) {
    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
    %5 = tensor.extract_slice %3[%arg0, 0] [1, 10240] [1, 1]  : tensor<320x10240xf32> to tensor<1x10240xf32>
    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
    %6 = tensor.extract_slice %arg1[%arg0] [1] [1] : tensor<320xf32> to tensor<1xf32>
    // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
    %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<1xf32>) -> tensor<1xf32>
    // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
    %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<1xf32>) -> tensor<1xf32>

    scf.forall.in_parallel {
      // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
      tensor.parallel_insert_slice %8 into %arg1[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32>
    }
  }
  return %4 : tensor<320xf32>
}

// -----
637
// %1 is read inside the loop (a different repetitive region than its
// definition) while %0 is overwritten in the loop, so the first fill must
// bufferize out-of-place.
// CHECK-LABEL: different_repetitive_region_via_alias
func.func @different_repetitive_region_via_alias(%arg0: tensor<4xf32>,
                                                 %arg1: tensor<4xf32>,
                                                 %arg2: index,
                                                 %arg3: index,
                                                 %arg4: index)
  -> (tensor<4xf32>)
{
  %cst = arith.constant 0.000000e+00 : f32
  %cst2 = arith.constant 1.000000e+00 : f32
  %0 = bufferization.alloc_tensor() : tensor<4xf32>

  // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4xf32>) -> tensor<4xf32>

  %2 = scf.for %arg5 = %arg2 to %arg3 step %arg4 iter_args(%arg6 = %arg1) -> (tensor<4xf32>) {
    // CHECK: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
    %4 = tensor.extract %1[%arg4] : tensor<4xf32>
    vector.print %4 : f32
    // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
    %5 = linalg.fill ins(%cst2 : f32) outs(%0 : tensor<4xf32>) -> tensor<4xf32>
    scf.yield %5 : tensor<4xf32>
  }

  return %2 : tensor<4xf32>
}

// -----
666
// CHECK-LABEL: no_raw_conflict_after_repetitive_use
func.func @no_raw_conflict_after_repetitive_use(%arg0: tensor<4xf32>,
                                                %arg1: tensor<4xf32>,
                                                %arg2: index,
                                                %arg3: index,
                                                %arg4: index)
  -> (tensor<4xf32>, tensor<4xf32>)
{
  %cst = arith.constant 0.000000e+00 : f32
  %cst2 = arith.constant 1.000000e+00 : f32
  %cst3 = arith.constant 2.000000e+00 : f32
  %0 = bufferization.alloc_tensor() : tensor<4xf32>

  // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4xf32>) -> tensor<4xf32>

  %2 = scf.for %arg5 = %arg2 to %arg3 step %arg4 iter_args(%arg6 = %arg1) -> (tensor<4xf32>) {
    // CHECK: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
    %4 = tensor.extract %1[%arg4] : tensor<4xf32>
    vector.print %4 : f32
    // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
    %5 = linalg.fill ins(%cst2 : f32) outs(%1 : tensor<4xf32>) -> tensor<4xf32>
    scf.yield %5 : tensor<4xf32>
  }

  // The following is *not* a RaW conflict.
  // CHECK: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
  %6 = tensor.extract %1[%arg4] : tensor<4xf32>
  vector.print %6 : f32
  // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
  %7 = linalg.fill ins(%cst3 : f32) outs(%1 : tensor<4xf32>) -> tensor<4xf32>

  return %2, %7 : tensor<4xf32>, tensor<4xf32>
}

// -----
703
// CHECK-LABEL: func @read_of_bbarg_in_repetitive_region(
func.func @read_of_bbarg_in_repetitive_region(
    %t: tensor<10xf32>, %a: index, %b: index, %c: index, %cst: f32) {
  // CHECK: scf.for
  scf.for %iv = %a to %b step %c {
    // Must bufferize out-of-place because definition of read is in a different
    // repetitive region.
    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true"]}
    %2 = tensor.extract_slice %t[0][4][1] : tensor<10xf32> to tensor<4xf32>
    %3 = tensor.extract %2[%a] : tensor<4xf32>
    vector.print %3 : f32
    // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "false", "none"]}
    %4 = tensor.insert %cst into %2[%a] : tensor<4xf32>
    %5 = tensor.extract %4[%a] : tensor<4xf32>
    vector.print %5 : f32
  }
  return
}

// -----
724
// CHECK-LABEL: func @read_definition_in_same_repetitive_region_as_write(
func.func @read_definition_in_same_repetitive_region_as_write(
    %t: tensor<10xf32>, %a: index, %b: index, %c: index, %cst: f32) {
  // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]}
  %1 = tensor.insert %cst into %t[%a] : tensor<10xf32>
  // CHECK: scf.for
  scf.for %iv = %a to %b step %c {
    // Can bufferize in-place.
    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true"]}
    %2 = tensor.extract_slice %1[0][4][1] : tensor<10xf32> to tensor<4xf32>
    %3 = tensor.extract %2[%a] : tensor<4xf32>
    vector.print %3 : f32
  }
  return
}

// -----
742
// CHECK-LABEL: func @read_definition_in_same_repetitive_region_as_conflicting_write(
// The loop body reads the original %t (not the insert's result %1), so the
// tensor.insert before the loop conflicts with a later read of its
// destination and must bufferize out-of-place.
func.func @read_definition_in_same_repetitive_region_as_conflicting_write(
    %t: tensor<10xf32>, %a: index, %b: index, %c: index, %cst: f32) {
  // Cannot bufferize in-place according to normal op dominance rules.
  // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "false", "none"]}
  %1 = tensor.insert %cst into %t[%a] : tensor<10xf32>
  // CHECK: scf.for
  scf.for %iv = %a to %b step %c {
    // Read of the pre-insert value of %t via the slice.
    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true"]}
    %2 = tensor.extract_slice %t[0][4][1] : tensor<10xf32> to tensor<4xf32>
    %3 = tensor.extract %2[%a] : tensor<4xf32>
    vector.print %3 : f32
  }
  return
}
758
759// -----
760
// CHECK-LABEL: func @write_value_in_repetitive_region(
// %0 below is the only read of the pre-loop value of %t. After that read
// there is no further use of the original value, so the linalg.fill inside
// the loop may overwrite the buffer of %t in-place.
func.func @write_value_in_repetitive_region(
    %t: tensor<10xf32>, %a: index, %b: index, %c: index, %cst: f32) {
  %0 = tensor.extract %t[%a] : tensor<10xf32>
  vector.print %0 : f32

  scf.for %iv = %a to %b step %c {
    // No further read of %0, so this can bufferize in-place.
    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true"]}
    %2 = tensor.extract_slice %t[0][4][1] : tensor<10xf32> to tensor<4xf32>
    // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
    %filled = linalg.fill ins(%cst : f32) outs(%2 : tensor<4xf32>) -> tensor<4xf32>
    %3 = tensor.extract %filled[%a] : tensor<4xf32>
    vector.print %3 : f32
  }
  return
}
778
779// -----
780
// CHECK-LABEL: func @nesting_op_repetitive_regions(
// Same conflict as in @read_definition_in_same_repetitive_region_as_conflicting_write,
// but the read of the original %t sits three scf.for levels deep. The insert
// before the loops must still bufferize out-of-place.
func.func @nesting_op_repetitive_regions(
    %t: tensor<10xf32>, %a: index, %b: index, %c: index, %cst: f32) {
  // Cannot bufferize in-place according to normal op dominance rules.
  // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "false", "none"]}
  %1 = tensor.insert %cst into %t[%a] : tensor<10xf32>
  // CHECK: scf.for
  scf.for %iv1 = %a to %b step %c {
    // CHECK: scf.for
    scf.for %iv2 = %a to %b step %c {
      // CHECK: scf.for
      scf.for %iv3 = %a to %b step %c {
        // Read of the pre-insert value of %t in the innermost loop.
        // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true"]}
        %2 = tensor.extract_slice %t[0][4][1] : tensor<10xf32> to tensor<4xf32>
        %3 = tensor.extract %2[%a] : tensor<4xf32>
        vector.print %3 : f32
      }
    }
  }
  return
}
802
803// -----
804
// CHECK-LABEL: func @parallel_region()
// %alloc1 is allocated outside the scf.forall but written inside it. The two
// RUN prefixes at the top of the file exercise the analysis with and without
// check-parallel-regions; only the former treats this as a conflict.
func.func @parallel_region() -> tensor<320xf32>
{
  %alloc0 = bufferization.alloc_tensor() : tensor<320xf32>
  %alloc1 = bufferization.alloc_tensor() : tensor<1xf32>
  %c320 = arith.constant 320 : index
  // CHECK: scf.forall
  %0 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %alloc0) -> (tensor<320xf32>) {
    %val = "test.foo"() : () -> (f32)
    // linalg.fill must bufferize out-of-place because every thread needs a
    // private copy of %alloc1. If not accounting for parallel regions, the fill
    // can bufferize in place.
    // PARALLEL-CHECK:    linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
    // NO-PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
    %fill = linalg.fill ins(%val : f32) outs(%alloc1 : tensor<1xf32>) -> tensor<1xf32>
    scf.forall.in_parallel {
      // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
      tensor.parallel_insert_slice %fill into %arg1[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32>
    }
  }
  // CHECK: } {__inplace_operands_attr__ = ["none", "true"]}
  return %0 : tensor<320xf32>
}
828
829// -----
830
// CHECK-LABEL: func @parallel_region_mixed_def(
// Variant of @parallel_region where the fill destination (%selected) may
// alias either %alloc1 (defined outside the scf.forall) or %alloc2 (defined
// inside), depending on %c. Because one potential definition lives outside
// the parallel region, the fill must still be out-of-place when parallel
// regions are taken into account.
func.func @parallel_region_mixed_def(%c: i1) -> tensor<320xf32>
{
  %alloc0 = bufferization.alloc_tensor() : tensor<320xf32>
  %alloc1 = bufferization.alloc_tensor() : tensor<1xf32>
  %c320 = arith.constant 320 : index
  // CHECK: scf.forall
  %0 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %alloc0) -> (tensor<320xf32>) {
    %alloc2 = bufferization.alloc_tensor() : tensor<1xf32>
    // %selected aliases %alloc1 or %alloc2 depending on %c.
    %selected = scf.if %c -> tensor<1xf32> {
      scf.yield %alloc1 : tensor<1xf32>
    } else {
      scf.yield %alloc2 : tensor<1xf32>
    }
    %val = "test.foo"() : () -> (f32)
    // linalg.fill must bufferize out-of-place because every thread needs a
    // private copy of %alloc1. If not accounting for parallel regions, the fill
    // can bufferize in place.
    // PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
    // NO-PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
    %fill = linalg.fill ins(%val : f32) outs(%selected : tensor<1xf32>) -> tensor<1xf32>
    scf.forall.in_parallel {
      // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
      tensor.parallel_insert_slice %fill into %arg1[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32>
    }
  }
  // CHECK: } {__inplace_operands_attr__ = ["none", "true"]}
  return %0 : tensor<320xf32>
}
860
861// -----
862
// CHECK-LABEL: func @parallel_region_two_writes(
// Variant of @parallel_region with a second write: after the fill is
// privatized (out-of-place), the subsequent tensor.insert writes into %fill,
// which is defined inside the parallel region, and can therefore be in-place.
func.func @parallel_region_two_writes(%f: f32) -> tensor<320xf32>
{
  %alloc0 = bufferization.alloc_tensor() : tensor<320xf32>
  %alloc1 = bufferization.alloc_tensor() : tensor<1xf32>
  %c320 = arith.constant 320 : index
  %c0 = arith.constant 0 : index
  // CHECK: scf.forall
  %0 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %alloc0) -> (tensor<320xf32>) {
    %val = "test.foo"() : () -> (f32)
    // linalg.fill must bufferize out-of-place because every thread needs a
    // private copy of %alloc1. If not accounting for parallel regions, the fill
    // can bufferize in place.
    // PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
    // NO-PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
    %fill = linalg.fill ins(%val : f32) outs(%alloc1 : tensor<1xf32>) -> tensor<1xf32>
    // In-place: %fill is defined inside this parallel region.
    // CHECK: tensor.insert
    // CHECK-SAME: __inplace_operands_attr__ = ["none", "true", "none"]
    %inserted = tensor.insert %f into %fill[%c0] : tensor<1xf32>

    scf.forall.in_parallel {
      // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
      tensor.parallel_insert_slice %inserted into %arg1[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32>
    }
  }
  // CHECK: } {__inplace_operands_attr__ = ["none", "true"]}
  return %0 : tensor<320xf32>
}
891
892// -----
893
// CHECK-LABEL: func @parallel_region_no_read()
// Variant of @parallel_region where the fill result is never read (%fill is
// unused and the in_parallel terminator is empty), so no privatization is
// needed and the fill stays in-place even inside the parallel region.
func.func @parallel_region_no_read()
{
  %alloc0 = bufferization.alloc_tensor() : tensor<320xf32>
  %alloc1 = bufferization.alloc_tensor() : tensor<1xf32>
  %c320 = arith.constant 320 : index
  // CHECK: scf.forall
  scf.forall (%arg0) in (%c320) {
    %val = "test.foo"() : () -> (f32)
    // linalg.fill can bufferize in-place because no alias of %alloc1 is read.
    // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
    %fill = linalg.fill ins(%val : f32) outs(%alloc1 : tensor<1xf32>) -> tensor<1xf32>
    scf.forall.in_parallel {
    }
  }
  return
}
911