// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(convert-vector-to-scf))" -split-input-file -allow-unregistered-dialect | FileCheck %s
// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(convert-vector-to-scf{full-unroll=true lower-scalable=true}))" -split-input-file -allow-unregistered-dialect | FileCheck %s --check-prefix=FULL-UNROLL
// RUN: mlir-opt %s "-convert-vector-to-scf=full-unroll target-rank=0" -split-input-file -allow-unregistered-dialect | FileCheck %s --check-prefix=TARGET-RANK-ZERO

// CHECK-LABEL: func @vector_transfer_ops_0d(
func.func @vector_transfer_ops_0d(%M: memref<f32>) {
  %f0 = arith.constant 0.0 : f32

  // 0-d transfers are left untouched by vector-to-scf.
  // They are independently lowered to the proper memref.load/store.
  //  CHECK: vector.transfer_read {{.*}}: memref<f32>, vector<f32>
  %0 = vector.transfer_read %M[], %f0 {permutation_map = affine_map<()->()>} :
    memref<f32>, vector<f32>

  //  CHECK: vector.transfer_write {{.*}}: vector<f32>, memref<f32>
  vector.transfer_write %0, %M[] {permutation_map = affine_map<()->()>} :
    vector<f32>, memref<f32>

  return
}

// -----

// CHECK-LABEL: func @materialize_read_1d() {
func.func @materialize_read_1d() {
  %f0 = arith.constant 0.0: f32
  %A = memref.alloc () : memref<7x42xf32>
  affine.for %i0 = 0 to 7 step 4 {
    affine.for %i1 = 0 to 42 step 4 {
      %f1 = vector.transfer_read %A[%i0, %i1], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
      %ip1 = affine.apply affine_map<(d0) -> (d0 + 1)> (%i1)
      %f2 = vector.transfer_read %A[%i0, %ip1], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
      %ip2 = affine.apply affine_map<(d0) -> (d0 + 2)> (%i1)
      %f3 = vector.transfer_read %A[%i0, %ip2], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
      %ip3 = affine.apply affine_map<(d0) -> (d0 + 3)> (%i1)
      %f4 = vector.transfer_read %A[%i0, %ip3], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
      // Both accesses in the load must be clipped otherwise %i1 + 2 and %i1 + 3 will go out of bounds.
      // CHECK: scf.if
      // CHECK-NEXT: memref.load
      // CHECK-NEXT: vector.insertelement
      // CHECK-NEXT: scf.yield
      // CHECK-NEXT: else
      // CHECK-NEXT: scf.yield
      // Add a dummy use to prevent dead code elimination from removing transfer
      // read ops.
      "dummy_use"(%f1, %f2, %f3, %f4) : (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) -> ()
    }
  }
  return
}

// -----
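/// With a mix of static and dynamic dimensions, the lowering should not need
/// to query the statically known sizes: the CHECK-NOT lines below verify that
/// no memref.dim is generated for dimensions 0 and 3.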

// CHECK-LABEL: func @materialize_read_1d_partially_specialized
func.func @materialize_read_1d_partially_specialized(%dyn1 : index, %dyn2 : index, %dyn4 : index) {
  %f0 = arith.constant 0.0: f32
  %A = memref.alloc (%dyn1, %dyn2, %dyn4) : memref<7x?x?x42x?xf32>
  affine.for %i0 = 0 to 7 {
    affine.for %i1 = 0 to %dyn1 {
      affine.for %i2 = 0 to %dyn2 {
        affine.for %i3 = 0 to 42 step 2 {
          affine.for %i4 = 0 to %dyn4 {
            %f1 = vector.transfer_read %A[%i0, %i1, %i2, %i3, %i4], %f0 {permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3)>} : memref<7x?x?x42x?xf32>, vector<4xf32>
            %i3p1 = affine.apply affine_map<(d0) -> (d0 + 1)> (%i3)
            %f2 = vector.transfer_read %A[%i0, %i1, %i2, %i3p1, %i4], %f0 {permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3)>} : memref<7x?x?x42x?xf32>, vector<4xf32>
            // Add a dummy use to prevent dead code elimination from removing
            // transfer read ops.
            "dummy_use"(%f1, %f2) : (vector<4xf32>, vector<4xf32>) -> ()
          }
        }
      }
    }
  }
  // CHECK: %[[tensor:[0-9a-zA-Z_]+]] = memref.alloc
  // CHECK-NOT: {{.*}} memref.dim %[[tensor]], %c0
  // CHECK-NOT: {{.*}} memref.dim %[[tensor]], %c3
  return
}

// -----

// CHECK: #[[$ADD:map.*]] = affine_map<(d0, d1) -> (d0 + d1)>

// CHECK-LABEL: func @materialize_read(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) {
func.func @materialize_read(%M: index, %N: index, %O: index, %P: index) {
  %f0 = arith.constant 0.0: f32
  // CHECK-DAG:  %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG:  %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG:  %[[C3:.*]] = arith.constant 3 : index
  // CHECK-DAG:  %[[C4:.*]] = arith.constant 4 : index
  // CHECK-DAG:  %[[C5:.*]] = arith.constant 5 : index
  // CHECK:      %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref<?x?x?x?xf32>
  // CHECK-NEXT:  affine.for %[[I0:.*]] = 0 to %{{.*}} step 3 {
  // CHECK-NEXT:    affine.for %[[I1:.*]] = 0 to %{{.*}} {
  // CHECK-NEXT:      affine.for %[[I2:.*]] = 0 to %{{.*}} {
  // CHECK-NEXT:        affine.for %[[I3:.*]] = 0 to %{{.*}} step 5 {
  // CHECK:               %[[ALLOC:.*]] = memref.alloca() : memref<vector<5x4x3xf32>>
  // CHECK:               scf.for %[[I4:.*]] = %[[C0]] to %[[C5]] step %[[C1]] {
  // CHECK:                 scf.if
  // CHECK:                   %[[L3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I4]])
  // CHECK:                   scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
  // CHECK:                     %[[VEC:.*]] = scf.for %[[I6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<3xf32>) {
  // CHECK:                       %[[L0:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I6]])
  // CHECK:                       scf.if {{.*}} -> (vector<3xf32>) {
  // CHECK-NEXT:                    %[[SCAL:.*]] = memref.load %{{.*}}[%[[L0]], %[[I1]], %[[I2]], %[[L3]]] : memref<?x?x?x?xf32>
  // CHECK-NEXT:                    %[[RVEC:.*]] = vector.insertelement %[[SCAL]], %{{.*}}[%[[I6]] : index] : vector<3xf32>
  // CHECK-NEXT:                    scf.yield
  // CHECK-NEXT:                  } else {
  // CHECK-NEXT:                    scf.yield
  // CHECK-NEXT:                  }
  // CHECK-NEXT:                  scf.yield
  // CHECK-NEXT:                }
  // CHECK-NEXT:                memref.store %[[VEC]], {{.*}} : memref<5x4xvector<3xf32>>
  // CHECK-NEXT:              }
  // CHECK-NEXT:            } else {
  // CHECK-NEXT:              memref.store {{.*}} : memref<5xvector<4x3xf32>>
  // CHECK-NEXT:            }
  // CHECK-NEXT:          }
  // CHECK-NEXT:          %[[LD:.*]] = memref.load %[[ALLOC]][] : memref<vector<5x4x3xf32>>
  // CHECK-NEXT:          "dummy_use"(%[[LD]]) : (vector<5x4x3xf32>) -> ()
  // CHECK-NEXT:        }
  // CHECK-NEXT:      }
  // CHECK-NEXT:    }
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return
  // CHECK-NEXT:}

  // Check that I0 + I6 (of size 3) is read from the first index, load(L0, ...),
  // and written into the last (innermost) position, insert(..., I6).
  // Check that I3 + I4 (of size 5) is read from the last index, load(..., L3),
  // and written into the first (outermost) position, store(I4, ...).
  // Other dimensions are just accessed with I1, I2 resp.
  %A = memref.alloc (%M, %N, %O, %P) : memref<?x?x?x?xf32, 0>
  affine.for %i0 = 0 to %M step 3 {
    affine.for %i1 = 0 to %N {
      affine.for %i2 = 0 to %O {
        affine.for %i3 = 0 to %P step 5 {
          %f = vector.transfer_read %A[%i0, %i1, %i2, %i3], %f0 {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, 0, d0)>} : memref<?x?x?x?xf32>, vector<5x4x3xf32>
          // Add a dummy use to prevent dead code elimination from removing
          // transfer read ops.
          "dummy_use"(%f) : (vector<5x4x3xf32>) -> ()
        }
      }
    }
  }
  return
}

// -----

// CHECK: #[[$ADD:map.*]] = affine_map<(d0, d1) -> (d0 + d1)>

// CHECK-LABEL:func @materialize_write(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) {
func.func @materialize_write(%M: index, %N: index, %O: index, %P: index) {
  // CHECK-DAG:  %{{.*}} = arith.constant dense<1.000000e+00> : vector<3x4x1x5xf32>
  // CHECK-DAG:  %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG:  %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG:  %[[C3:.*]] = arith.constant 3 : index
  // CHECK-DAG:  %[[C4:.*]] = arith.constant 4 : index
  // CHECK:      %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref<?x?x?x?xf32>
  // CHECK-NEXT: affine.for %[[I0:.*]] = 0 to %{{.*}} step 3 {
  // CHECK-NEXT:   affine.for %[[I1:.*]] = 0 to %{{.*}} step 4 {
  // CHECK-NEXT:     affine.for %[[I2:.*]] = 0 to %{{.*}} {
  // CHECK-NEXT:       affine.for %[[I3:.*]] = 0 to %{{.*}} step 5 {
  // CHECK:              %[[ALLOC:.*]] = memref.alloca() : memref<vector<3x4x1x5xf32>>
  // CHECK:              memref.store %{{.*}}, %[[ALLOC]][] : memref<vector<3x4x1x5xf32>>
  // CHECK:              %[[VECTOR_VIEW1:.*]] = vector.type_cast %[[ALLOC]] : memref<vector<3x4x1x5xf32>> to memref<3xvector<4x1x5xf32>>
  // CHECK:              scf.for %[[I4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
  // CHECK:                scf.if
  // CHECK:                  %[[S3:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I4]])
  // CHECK:                  %[[VECTOR_VIEW2:.*]] = vector.type_cast %[[VECTOR_VIEW1]] : memref<3xvector<4x1x5xf32>> to memref<3x4xvector<1x5xf32>>
  // CHECK:                  scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
  // CHECK:                    scf.if
  // CHECK:                      %[[S1:.*]] = affine.apply #[[$ADD]](%[[I1]], %[[I5]])
  // CHECK:                      %[[VECTOR_VIEW3:.*]] = vector.type_cast %[[VECTOR_VIEW2]] : memref<3x4xvector<1x5xf32>> to memref<3x4x1xvector<5xf32>>
  // CHECK:                      scf.for %[[I6:.*]] = %[[C0]] to %[[C1]] step %[[C1]] {
  // CHECK:                        %[[S0:.*]] = affine.apply #[[$ADD]](%[[I2]], %[[I6]])
  // CHECK:                        %[[VEC:.*]] = memref.load %[[VECTOR_VIEW3]][%[[I4]], %[[I5]], %[[I6]]] : memref<3x4x1xvector<5xf32>>
  // CHECK:                        vector.transfer_write %[[VEC]], %{{.*}}[%[[S3]], %[[S1]], %[[S0]], %[[I3]]] : vector<5xf32>, memref<?x?x?x?xf32>
  // CHECK:                      }
  // CHECK:                    }
  // CHECK:                  }
  // CHECK:                }
  // CHECK:              }
  // CHECK:            }
  // CHECK:          }
  // CHECK:        }
  // CHECK:      }
  // CHECK:      return

  // Check that I0 + I4 (of size 3) is read from the first index, load(I4, ...), and written into the first index, store(S3, ...).
  // Check that I1 + I5 (of size 4) is read from the second index, load(..., I5, ...), and written into the second index, store(..., S1, ...).
  // Check that I2 + I6 (the unit dim) is read from the last index, load(..., I6), and written into the third index, store(..., S0, ...).
  // The innermost vector<5xf32> slices are written starting at the last index, store(..., I3).
  %A = memref.alloc (%M, %N, %O, %P) : memref<?x?x?x?xf32, 0>
  %f1 = arith.constant dense<1.000000e+00> : vector<5x4x3xf32>
  affine.for %i0 = 0 to %M step 3 {
    affine.for %i1 = 0 to %N step 4 {
      affine.for %i2 = 0 to %O {
        affine.for %i3 = 0 to %P step 5 {
          vector.transfer_write %f1, %A[%i0, %i1, %i2, %i3] {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d1, d0)>} : vector<5x4x3xf32>, memref<?x?x?x?xf32>
        }
      }
    }
  }
  return
}

// -----
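/// Progressive lowering of a 2-D out-of-bounds read: the default pipeline is
/// expected to materialize the result in a memref<vector<3x15xf32>> alloca and
/// fill it row by row inside an scf.for, guarding each 1-D read with an scf.if
/// bounds check and falling back to the padding splat. FULL-UNROLL instead
/// expects the three rows to be unrolled into scf.if + vector.insert chains.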

// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>

// FULL-UNROLL-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 + 1)>
// FULL-UNROLL-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 2)>


// CHECK-LABEL: transfer_read_progressive(
//  CHECK-SAME:   %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
//  CHECK-SAME:   %[[base:[a-zA-Z0-9]+]]: index

// FULL-UNROLL-LABEL: transfer_read_progressive(
//  FULL-UNROLL-SAME:   %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
//  FULL-UNROLL-SAME:   %[[base:[a-zA-Z0-9]+]]: index

func.func @transfer_read_progressive(%A : memref<?x?xf32>, %base: index) -> vector<3x15xf32> {
  %f7 = arith.constant 7.0: f32
  // CHECK-DAG: %[[C7:.*]] = arith.constant 7.000000e+00 : f32
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
  // CHECK-DAG: %[[splat:.*]] = arith.constant dense<7.000000e+00> : vector<15xf32>
  // CHECK-DAG: %[[alloc:.*]] = memref.alloca() : memref<vector<3x15xf32>>
  // CHECK:     %[[alloc_casted:.*]] = vector.type_cast %[[alloc]] : memref<vector<3x15xf32>> to memref<3xvector<15xf32>>
  // CHECK:     scf.for %[[I:.*]] = %[[C0]] to %[[C3]]
  // CHECK:       %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
  // CHECK:       %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]]
  // CHECK:       %[[cond1:.*]] = arith.cmpi sgt, %[[dim]], %[[add]] : index
  // CHECK:       scf.if %[[cond1]] {
  // CHECK:         %[[vec_1d:.*]] = vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
  // CHECK:         memref.store %[[vec_1d]], %[[alloc_casted]][%[[I]]] : memref<3xvector<15xf32>>
  // CHECK:       } else {
  // CHECK:         store %[[splat]], %[[alloc_casted]][%[[I]]] : memref<3xvector<15xf32>>
  // CHECK:       }
  // CHECK:     }
  // CHECK:     %[[cst:.*]] = memref.load %[[alloc]][] : memref<vector<3x15xf32>>

  // FULL-UNROLL-DAG: %[[C7:.*]] = arith.constant 7.000000e+00 : f32
  // FULL-UNROLL-DAG: %[[VEC0:.*]] = arith.constant dense<7.000000e+00> : vector<3x15xf32>
  // FULL-UNROLL-DAG: %[[C0:.*]] = arith.constant 0 : index
  // FULL-UNROLL: %[[DIM:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
  // FULL-UNROLL: cmpi sgt, %[[DIM]], %[[base]] : index
  // FULL-UNROLL: %[[VEC1:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) {
  // FULL-UNROLL:   vector.transfer_read %[[A]][%[[base]], %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
  // FULL-UNROLL:   vector.insert %{{.*}}, %[[VEC0]] [0] : vector<15xf32> into vector<3x15xf32>
  // FULL-UNROLL:   scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: } else {
  // FULL-UNROLL:   scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: }
  // FULL-UNROLL: affine.apply #[[$MAP1]]()[%[[base]]]
  // FULL-UNROLL: cmpi sgt, %{{.*}}, %{{.*}} : index
  // FULL-UNROLL: %[[VEC2:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) {
  // FULL-UNROLL:   vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
  // FULL-UNROLL:   vector.insert %{{.*}}, %[[VEC1]] [1] : vector<15xf32> into vector<3x15xf32>
  // FULL-UNROLL:   scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: } else {
  // FULL-UNROLL:   scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: }
  // FULL-UNROLL: affine.apply #[[$MAP2]]()[%[[base]]]
  // FULL-UNROLL: cmpi sgt, %{{.*}}, %{{.*}} : index
  // FULL-UNROLL: %[[VEC3:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) {
  // FULL-UNROLL:   vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
  // FULL-UNROLL:   vector.insert %{{.*}}, %[[VEC2]] [2] : vector<15xf32> into vector<3x15xf32>
  // FULL-UNROLL:   scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: } else {
  // FULL-UNROLL:   scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: }

  %f = vector.transfer_read %A[%base, %base], %f7 :
    memref<?x?xf32>, vector<3x15xf32>

  return %f: vector<3x15xf32>
}

// -----
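/// The write counterpart of the test above: the source vector is expected to
/// be spilled to a memref<vector<3x15xf32>> alloca and each row written back
/// with a 1-D transfer guarded by an scf.if bounds check (or fully unrolled
/// under FULL-UNROLL).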

// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>

// FULL-UNROLL-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 + 1)>
// FULL-UNROLL-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 2)>

// CHECK-LABEL: transfer_write_progressive(
//  CHECK-SAME:   %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
//  CHECK-SAME:   %[[base:[a-zA-Z0-9]+]]: index,
//  CHECK-SAME:   %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
// FULL-UNROLL-LABEL: transfer_write_progressive(
//  FULL-UNROLL-SAME:   %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
//  FULL-UNROLL-SAME:   %[[base:[a-zA-Z0-9]+]]: index,
//  FULL-UNROLL-SAME:   %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
func.func @transfer_write_progressive(%A : memref<?x?xf32>, %base: index, %vec: vector<3x15xf32>) {
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
  // CHECK:     %[[alloc:.*]] = memref.alloca() : memref<vector<3x15xf32>>
  // CHECK:     memref.store %[[vec]], %[[alloc]][] : memref<vector<3x15xf32>>
  // CHECK:     %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<vector<3x15xf32>> to memref<3xvector<15xf32>>
  // CHECK:     scf.for %[[I:.*]] = %[[C0]] to %[[C3]]
  // CHECK:       %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
  // CHECK:       %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]]
  // CHECK:       %[[cmp:.*]] = arith.cmpi sgt, %[[dim]], %[[add]] : index
  // CHECK:       scf.if %[[cmp]] {
  // CHECK:         %[[vec_1d:.*]] = memref.load %[[vmemref]][%[[I]]] : memref<3xvector<15xf32>>
  // CHECK:         vector.transfer_write %[[vec_1d]], %[[A]][{{.*}}, %[[base]]] : vector<15xf32>, memref<?x?xf32>
  // CHECK:       }
  // CHECK:     }

  // FULL-UNROLL: %[[C0:.*]] = arith.constant 0 : index
  // FULL-UNROLL: %[[DIM:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
  // FULL-UNROLL: %[[CMP0:.*]] = arith.cmpi sgt, %[[DIM]], %[[base]] : index
  // FULL-UNROLL: scf.if %[[CMP0]] {
  // FULL-UNROLL:   %[[V0:.*]] = vector.extract %[[vec]][0] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL:   vector.transfer_write %[[V0]], %[[A]][%[[base]], %[[base]]] : vector<15xf32>, memref<?x?xf32>
  // FULL-UNROLL: }
  // FULL-UNROLL: %[[I1:.*]] = affine.apply #[[$MAP1]]()[%[[base]]]
  // FULL-UNROLL: %[[CMP1:.*]] = arith.cmpi sgt, %{{.*}}, %[[I1]] : index
  // FULL-UNROLL: scf.if %[[CMP1]] {
  // FULL-UNROLL:   %[[V1:.*]] = vector.extract %[[vec]][1] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL:   vector.transfer_write %[[V1]], %[[A]][%{{.*}}, %[[base]]] : vector<15xf32>, memref<?x?xf32>
  // FULL-UNROLL: }
  // FULL-UNROLL: %[[I2:.*]] = affine.apply #[[$MAP2]]()[%[[base]]]
  // FULL-UNROLL: %[[CMP2:.*]] = arith.cmpi sgt, %{{.*}}, %[[I2]] : index
  // FULL-UNROLL: scf.if %[[CMP2]] {
  // FULL-UNROLL:   %[[V2:.*]] = vector.extract %[[vec]][2] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL:   vector.transfer_write %[[V2]], %[[A]][%{{.*}}, %[[base]]] : vector<15xf32>, memref<?x?xf32>
  // FULL-UNROLL: }

  vector.transfer_write %vec, %A[%base, %base] :
    vector<3x15xf32>, memref<?x?xf32>
  return
}

// -----
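/// Same as @transfer_write_progressive, but the transfer is marked in_bounds,
/// so no scf.if guards should be emitted (checked via CHECK-NOT below).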

// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>

// FULL-UNROLL-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 + 1)>
// FULL-UNROLL-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 2)>

// CHECK-LABEL: transfer_write_progressive_inbounds(
//  CHECK-SAME:   %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
//  CHECK-SAME:   %[[base:[a-zA-Z0-9]+]]: index,
//  CHECK-SAME:   %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
// FULL-UNROLL-LABEL: transfer_write_progressive_inbounds(
//  FULL-UNROLL-SAME:   %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
//  FULL-UNROLL-SAME:   %[[base:[a-zA-Z0-9]+]]: index,
//  FULL-UNROLL-SAME:   %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
func.func @transfer_write_progressive_inbounds(%A : memref<?x?xf32>, %base: index, %vec: vector<3x15xf32>) {
  // CHECK-NOT:    scf.if
  // CHECK-DAG:  %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG:  %[[C3:.*]] = arith.constant 3 : index
  // CHECK:      %[[alloc:.*]] = memref.alloca() : memref<vector<3x15xf32>>
  // CHECK-NEXT: memref.store %[[vec]], %[[alloc]][] : memref<vector<3x15xf32>>
  // CHECK-NEXT: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<vector<3x15xf32>> to memref<3xvector<15xf32>>
  // CHECK-NEXT: scf.for %[[I:.*]] = %[[C0]] to %[[C3]]
  // CHECK-NEXT:   %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]]
  // CHECK-NEXT:   %[[vec_1d:.*]] = memref.load %[[vmemref]][%[[I]]] : memref<3xvector<15xf32>>
  // CHECK-NEXT:   vector.transfer_write %[[vec_1d]], %[[A]][%[[add]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>

  // FULL-UNROLL: %[[VEC0:.*]] = vector.extract %[[vec]][0] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[VEC0]], %[[A]][%[[base]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>
  // FULL-UNROLL: %[[I1:.*]] = affine.apply #[[$MAP1]]()[%[[base]]]
  // FULL-UNROLL: %[[VEC1:.*]] = vector.extract %[[vec]][1] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[VEC1]], %[[A]][%[[I1]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>
  // FULL-UNROLL: %[[I2:.*]] = affine.apply #[[$MAP2]]()[%[[base]]]
  // FULL-UNROLL: %[[VEC2:.*]] = vector.extract %[[vec]][2] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[VEC2]], %[[A]][%[[I2]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>
  vector.transfer_write %vec, %A[%base, %base] {in_bounds = [true, true]} :
    vector<3x15xf32>, memref<?x?xf32>
  return
}

// -----
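/// Full unrolling of a small static read: each row is expected to become a
/// 1-D vector.transfer_read that is vector.insert'ed into the accumulated
/// result.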

// FULL-UNROLL-LABEL: transfer_read_simple
func.func @transfer_read_simple(%A : memref<2x2xf32>) -> vector<2x2xf32> {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32
  // FULL-UNROLL-DAG: %[[VC0:.*]] = arith.constant dense<0.000000e+00> : vector<2x2xf32>
  // FULL-UNROLL-DAG: %[[C0:.*]] = arith.constant 0 : index
  // FULL-UNROLL-DAG: %[[C1:.*]] = arith.constant 1 : index
  // FULL-UNROLL: %[[V0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]]
  // FULL-UNROLL: %[[RES0:.*]] = vector.insert %[[V0]], %[[VC0]] [0] : vector<2xf32> into vector<2x2xf32>
  // FULL-UNROLL: %[[V1:.*]] = vector.transfer_read %{{.*}}[%[[C1]], %[[C0]]]
  // FULL-UNROLL: %[[RES1:.*]] = vector.insert %[[V1]], %[[RES0]] [1] : vector<2xf32> into vector<2x2xf32>
  %0 = vector.transfer_read %A[%c0, %c0], %f0 : memref<2x2xf32>, vector<2x2xf32>
  return %0 : vector<2x2xf32>
}
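/// The two minor-identity tests below use permutation maps that address the
/// two innermost dimensions of a 4-D memref; the lowering is expected to loop
/// over the leading vector dimension and guard each 1-D transfer with a
/// bounds check against dim 2 of the memref.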

func.func @transfer_read_minor_identity(%A : memref<?x?x?x?xf32>) -> vector<3x3xf32> {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32
  %0 = vector.transfer_read %A[%c0, %c0, %c0, %c0], %f0
    { permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)> }
      : memref<?x?x?x?xf32>, vector<3x3xf32>
  return %0 : vector<3x3xf32>
}

// CHECK-LABEL: transfer_read_minor_identity(
//  CHECK-SAME: %[[A:.*]]: memref<?x?x?x?xf32>) -> vector<3x3xf32>
//  CHECK-DAG:    %[[c0:.*]] = arith.constant 0 : index
//  CHECK-DAG:    %[[c1:.*]] = arith.constant 1 : index
//  CHECK-DAG:    %[[c2:.*]] = arith.constant 2 : index
//  CHECK-DAG:    %[[c3:.*]] = arith.constant 3 : index
//  CHECK-DAG:    %[[f0:.*]] = arith.constant 0.000000e+00 : f32
//  CHECK-DAG:    %[[cst0:.*]] = arith.constant dense<0.000000e+00> : vector<3xf32>
//  CHECK:        %[[m:.*]] = memref.alloca() : memref<vector<3x3xf32>>
//  CHECK:        %[[cast:.*]] = vector.type_cast %[[m]] : memref<vector<3x3xf32>> to memref<3xvector<3xf32>>
//  CHECK:        scf.for %[[arg1:.*]] = %[[c0]] to %[[c3]]
//  CHECK:          %[[d:.*]] = memref.dim %[[A]], %[[c2]] : memref<?x?x?x?xf32>
//  CHECK:          %[[cmp:.*]] = arith.cmpi sgt, %[[d]], %[[arg1]] : index
//  CHECK:          scf.if %[[cmp]] {
//  CHECK:            %[[tr:.*]] = vector.transfer_read %[[A]][%c0, %c0, %[[arg1]], %c0], %[[f0]] : memref<?x?x?x?xf32>, vector<3xf32>
//  CHECK:            memref.store %[[tr]], %[[cast]][%[[arg1]]] : memref<3xvector<3xf32>>
//  CHECK:          } else {
//  CHECK:            memref.store %[[cst0]], %[[cast]][%[[arg1]]] : memref<3xvector<3xf32>>
//  CHECK:          }
//  CHECK:        }
//  CHECK:        %[[ret:.*]]  = memref.load %[[m]][] : memref<vector<3x3xf32>>
//  CHECK:        return %[[ret]] : vector<3x3xf32>

func.func @transfer_write_minor_identity(%A : vector<3x3xf32>, %B : memref<?x?x?x?xf32>) {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32
  vector.transfer_write %A, %B[%c0, %c0, %c0, %c0]
    { permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)> }
      : vector<3x3xf32>, memref<?x?x?x?xf32>
  return
}

// CHECK-LABEL: transfer_write_minor_identity(
// CHECK-SAME:      %[[A:.*]]: vector<3x3xf32>,
// CHECK-SAME:      %[[B:.*]]: memref<?x?x?x?xf32>)
// CHECK-DAG:     %[[c0:.*]] = arith.constant 0 : index
// CHECK-DAG:     %[[c1:.*]] = arith.constant 1 : index
// CHECK-DAG:     %[[c2:.*]] = arith.constant 2 : index
// CHECK-DAG:     %[[c3:.*]] = arith.constant 3 : index
// CHECK:         %[[m:.*]] = memref.alloca() : memref<vector<3x3xf32>>
// CHECK:         memref.store %[[A]], %[[m]][] : memref<vector<3x3xf32>>
// CHECK:         %[[cast:.*]] = vector.type_cast %[[m]] : memref<vector<3x3xf32>> to memref<3xvector<3xf32>>
// CHECK:         scf.for %[[arg2:.*]] = %[[c0]] to %[[c3]]
// CHECK:           %[[d:.*]] = memref.dim %[[B]], %[[c2]] : memref<?x?x?x?xf32>
// CHECK:           %[[cmp:.*]] = arith.cmpi sgt, %[[d]], %[[arg2]] : index
// CHECK:           scf.if %[[cmp]] {
// CHECK:             %[[tmp:.*]] = memref.load %[[cast]][%[[arg2]]] : memref<3xvector<3xf32>>
// CHECK:             vector.transfer_write %[[tmp]], %[[B]][%[[c0]], %[[c0]], %[[arg2]], %[[c0]]] : vector<3xf32>, memref<?x?x?x?xf32>
// CHECK:           }
// CHECK:         }
// CHECK:         return


// -----
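/// Transfers on a memref with a non-contiguous (strided) layout cannot be kept
/// as 1-D vector transfers, so the conversion is expected to scalarize them
/// into scf.for loops of memref.load / memref.store.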

func.func @transfer_read_strided(%A : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>) -> vector<4xf32> {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32
  %0 = vector.transfer_read %A[%c0, %c0], %f0
      : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>, vector<4xf32>
  return %0 : vector<4xf32>
}

// CHECK-LABEL: transfer_read_strided(
// CHECK: scf.for
// CHECK: memref.load

func.func @transfer_write_strided(%A : vector<4xf32>, %B : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>) {
  %c0 = arith.constant 0 : index
  vector.transfer_write %A, %B[%c0, %c0] :
    vector<4xf32>, memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>
  return
}

// CHECK-LABEL: transfer_write_strided(
// CHECK: scf.for
// CHECK: store

// -----
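/// The scratch alloca created for the transfer must stay inside the
/// async.execute region rather than being hoisted to the function entry block;
/// the CHECK-NOT/CHECK pair below pins down that ordering.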

func.func private @fake_side_effecting_fun(%0: vector<2x2xf32>) -> ()

// CHECK-LABEL: transfer_read_within_async_execute
func.func @transfer_read_within_async_execute(%A : memref<2x2xf32>) -> !async.token {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32
  // CHECK-NOT: alloca
  //     CHECK: async.execute
  //     CHECK:   alloca
  %token = async.execute {
    %0 = vector.transfer_read %A[%c0, %c0], %f0 : memref<2x2xf32>, vector<2x2xf32>
    func.call @fake_side_effecting_fun(%0) : (vector<2x2xf32>) -> ()
    async.yield
  }
  return %token : !async.token
}

// -----
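/// A read from a 0-d tensor with a broadcast permutation map is expected to
/// become a 0-d transfer_read followed by a vector.broadcast to the
/// one-element result vector.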

// CHECK-LABEL: transfer_read_with_tensor
func.func @transfer_read_with_tensor(%arg: tensor<f32>) -> vector<1xf32> {
    // CHECK:      %[[EXTRACTED:.*]] = vector.transfer_read %{{.*}}[], %{{.*}} : tensor<f32>, vector<f32>
    // CHECK-NEXT: %[[RESULT:.*]] = vector.broadcast %[[EXTRACTED]] : vector<f32> to vector<1xf32>
    // CHECK-NEXT: return %[[RESULT]] : vector<1xf32>
    %f0 = arith.constant 0.0 : f32
    %0 = vector.transfer_read %arg[], %f0 {permutation_map = affine_map<()->(0)>} :
      tensor<f32>, vector<1xf32>
    return %0: vector<1xf32>
}

// -----
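/// A masked 1-D write of a scalable vector is scalarized: an scf.for over
/// vscale * 16 elements extracts the corresponding mask bit and, when it is
/// set, stores the matching element.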

// CHECK-LABEL: transfer_write_scalable
func.func @transfer_write_scalable(%arg0: memref<?xf32, strided<[?], offset: ?>>, %arg1: f32) {
  %0 = llvm.mlir.constant(0 : i32) : i32
  %c0 = arith.constant 0 : index
  %dim = memref.dim %arg0, %c0 : memref<?xf32, strided<[?], offset: ?>>
  %1 = llvm.intr.stepvector : vector<[16]xi32>
  %2 = arith.index_cast %dim : index to i32
  %3 = llvm.mlir.undef : vector<[16]xi32>
  %4 = llvm.insertelement %2, %3[%0 : i32] : vector<[16]xi32>
  %5 = llvm.shufflevector %4, %3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<[16]xi32>
  %6 = arith.cmpi slt, %1, %5 : vector<[16]xi32>
  %7 = llvm.mlir.undef : vector<[16]xf32>
  %8 = llvm.insertelement %arg1, %7[%0 : i32] : vector<[16]xf32>
  %9 = llvm.shufflevector %8, %7 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<[16]xf32>
  vector.transfer_write %9, %arg0[%c0], %6 {in_bounds = [true]} : vector<[16]xf32>, memref<?xf32, strided<[?], offset: ?>>
  return
}

// CHECK-SAME:      %[[ARG_0:.*]]: memref<?xf32, strided<[?], offset: ?>>,
// CHECK-DAG:       %[[C_0:.*]] = arith.constant 0 : index
// CHECK-DAG:       %[[C_16:.*]] = arith.constant 16 : index
// CHECK-DAG:       %[[STEP:.*]] = arith.constant 1 : index
// CHECK:           %[[MASK_VEC:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} : vector<[16]xi32>
// CHECK:           %[[VSCALE:.*]] = vector.vscale
// CHECK:           %[[UB:.*]] = arith.muli %[[VSCALE]], %[[C_16]] : index
// CHECK:           scf.for %[[IDX:.*]] = %[[C_0]] to %[[UB]] step %[[STEP]] {
// CHECK:             %[[MASK_VAL:.*]] = vector.extractelement %[[MASK_VEC]][%[[IDX]] : index] : vector<[16]xi1>
// CHECK:             scf.if %[[MASK_VAL]] {
// CHECK:               %[[VAL_TO_STORE:.*]] = vector.extractelement %{{.*}}[%[[IDX]] : index] : vector<[16]xf32>
// CHECK:               memref.store %[[VAL_TO_STORE]], %[[ARG_0]][%[[IDX]]] : memref<?xf32, strided<[?], offset: ?>>
// CHECK:             } else {
// CHECK:             }
// CHECK:           }

// -----
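/// vector.print of an n-D vector is lowered to nested scf.for loops that print
/// one element at a time, with punctuation ops emitting the surrounding
/// brackets and commas. The 0-d case is first shape_cast to vector<1xf32>.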

func.func @vector_print_vector_0d(%arg0: vector<f32>) {
  vector.print %arg0 : vector<f32>
  return
}
// CHECK-LABEL:   func.func @vector_print_vector_0d(
// CHECK-SAME:                                      %[[VEC:.*]]: vector<f32>) {
// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
// CHECK:           %[[FLAT_VEC:.*]] = vector.shape_cast %[[VEC]] : vector<f32> to vector<1xf32>
// CHECK:           vector.print punctuation <open>
// CHECK:           scf.for %[[IDX:.*]] = %[[C0]] to %[[C1]] step %[[C1]] {
// CHECK:             %[[EL:.*]] = vector.extractelement %[[FLAT_VEC]]{{\[}}%[[IDX]] : index] : vector<1xf32>
// CHECK:             vector.print %[[EL]] : f32 punctuation <no_punctuation>
// CHECK:             %[[IS_NOT_LAST:.*]] = arith.cmpi ult, %[[IDX]], %[[C0]] : index
// CHECK:             scf.if %[[IS_NOT_LAST]] {
// CHECK:               vector.print punctuation <comma>
// CHECK:             }
// CHECK:           }
// CHECK:           vector.print punctuation <close>
// CHECK:           vector.print
// CHECK:           return
// CHECK:         }

// -----

func.func @vector_print_vector(%arg0: vector<2x2xf32>) {
  vector.print %arg0 : vector<2x2xf32>
  return
}
// CHECK-LABEL:   func.func @vector_print_vector(
// CHECK-SAME:                                   %[[VEC:.*]]: vector<2x2xf32>) {
// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG:       %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
// CHECK:           %[[FLAT_VEC:.*]] = vector.shape_cast %[[VEC]] : vector<2x2xf32> to vector<4xf32>
// CHECK:           vector.print punctuation <open>
// CHECK:           scf.for %[[I:.*]] = %[[C0]] to %[[C2]] step %[[C1]] {
// CHECK:             vector.print punctuation <open>
// CHECK:             scf.for %[[J:.*]] = %[[C0]] to %[[C2]] step %[[C1]] {
// CHECK:               %[[OUTER_INDEX:.*]] = arith.muli %[[I]], %[[C2]] : index
// CHECK:               %[[FLAT_INDEX:.*]] = arith.addi %[[J]], %[[OUTER_INDEX]] : index
// CHECK:               %[[EL:.*]] = vector.extractelement %[[FLAT_VEC]]{{\[}}%[[FLAT_INDEX]] : index] : vector<4xf32>
// CHECK:               vector.print %[[EL]] : f32 punctuation <no_punctuation>
// CHECK:               %[[IS_NOT_LAST_J:.*]] = arith.cmpi ult, %[[J]], %[[C1]] : index
// CHECK:               scf.if %[[IS_NOT_LAST_J]] {
// CHECK:                 vector.print punctuation <comma>
// CHECK:               }
// CHECK:             }
// CHECK:             vector.print punctuation <close>
// CHECK:             %[[IS_NOT_LAST_I:.*]] = arith.cmpi ult, %[[I]], %[[C1]] : index
// CHECK:             scf.if %[[IS_NOT_LAST_I]] {
// CHECK:               vector.print punctuation <comma>
// CHECK:             }
// CHECK:           }
// CHECK:           vector.print punctuation <close>
// CHECK:           vector.print
// CHECK:           return
// CHECK:         }

// -----

func.func @vector_print_scalable_vector(%arg0: vector<[4]xi32>) {
  vector.print %arg0 : vector<[4]xi32>
  return
}
// CHECK-LABEL:   func.func @vector_print_scalable_vector(
// CHECK-SAME:                                            %[[VEC:.*]]: vector<[4]xi32>) {
// CHECK:           %[[C0:.*]] = arith.constant 0 : index
// CHECK:           %[[C4:.*]] = arith.constant 4 : index
// CHECK:           %[[C1:.*]] = arith.constant 1 : index
// CHECK:           %[[VSCALE:.*]] = vector.vscale
// CHECK:           %[[UPPER_BOUND:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
// CHECK:           %[[LAST_INDEX:.*]] = arith.subi %[[UPPER_BOUND]], %[[C1]] : index
// CHECK:           vector.print punctuation <open>
// CHECK:           scf.for %[[IDX:.*]] = %[[C0]] to %[[UPPER_BOUND]] step %[[C1]] {
// CHECK:             %[[EL:.*]] = vector.extractelement %[[VEC]]{{\[}}%[[IDX]] : index] : vector<[4]xi32>
// CHECK:             vector.print %[[EL]] : i32 punctuation <no_punctuation>
// CHECK:             %[[IS_NOT_LAST:.*]] = arith.cmpi ult, %[[IDX]], %[[LAST_INDEX]] : index
// CHECK:             scf.if %[[IS_NOT_LAST]] {
// CHECK:               vector.print punctuation <comma>
// CHECK:             }
// CHECK:           }
// CHECK:           vector.print punctuation <close>
// CHECK:           vector.print
// CHECK:           return
// CHECK:         }

// -----
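/// Transfers of "arrays of scalable vectors" (fixed leading dim, scalable
/// trailing dim) unpack only the fixed dimension: the value and the mask are
/// spilled to allocas, type_cast to memrefs of 1-D scalable vectors, and the
/// scf.for body issues masked 1-D transfers.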

func.func @transfer_read_array_of_scalable(%arg0: memref<3x?xf32>) -> vector<3x[4]xf32> {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %cst = arith.constant 0.000000e+00 : f32
  %dim = memref.dim %arg0, %c1 : memref<3x?xf32>
  %mask = vector.create_mask %c1, %dim : vector<3x[4]xi1>
  %read = vector.transfer_read %arg0[%c0, %c0], %cst, %mask {in_bounds = [true, true]} : memref<3x?xf32>, vector<3x[4]xf32>
  return %read : vector<3x[4]xf32>
}
// CHECK-LABEL:   func.func @transfer_read_array_of_scalable(
// CHECK-SAME:                                               %[[ARG:.*]]: memref<3x?xf32>) -> vector<3x[4]xf32> {
// CHECK-DAG:       %[[PADDING:.*]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG:       %[[C3:.*]] = arith.constant 3 : index
// CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
// CHECK:           %[[ALLOCA_VEC:.*]] = memref.alloca() : memref<vector<3x[4]xf32>>
// CHECK:           %[[ALLOCA_MASK:.*]] = memref.alloca() : memref<vector<3x[4]xi1>>
// CHECK:           %[[DIM_SIZE:.*]] = memref.dim %[[ARG]], %[[C1]] : memref<3x?xf32>
// CHECK:           %[[MASK:.*]] = vector.create_mask %[[C1]], %[[DIM_SIZE]] : vector<3x[4]xi1>
// CHECK:           memref.store %[[MASK]], %[[ALLOCA_MASK]][] : memref<vector<3x[4]xi1>>
// CHECK:           %[[UNPACK_VECTOR:.*]] = vector.type_cast %[[ALLOCA_VEC]] : memref<vector<3x[4]xf32>> to memref<3xvector<[4]xf32>>
// CHECK:           %[[UNPACK_MASK:.*]] = vector.type_cast %[[ALLOCA_MASK]] : memref<vector<3x[4]xi1>> to memref<3xvector<[4]xi1>>
// CHECK:           scf.for %[[VAL_11:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
// CHECK:             %[[MASK_SLICE:.*]] = memref.load %[[UNPACK_MASK]]{{\[}}%[[VAL_11]]] : memref<3xvector<[4]xi1>>
// CHECK:             %[[READ_SLICE:.*]] = vector.transfer_read %[[ARG]]{{\[}}%[[VAL_11]], %[[C0]]], %[[PADDING]], %[[MASK_SLICE]] {in_bounds = [true]} : memref<3x?xf32>, vector<[4]xf32>
// CHECK:             memref.store %[[READ_SLICE]], %[[UNPACK_VECTOR]]{{\[}}%[[VAL_11]]] : memref<3xvector<[4]xf32>>
// CHECK:           }
// CHECK:           %[[RESULT:.*]] = memref.load %[[ALLOCA_VEC]][] : memref<vector<3x[4]xf32>>
// CHECK:           return %[[RESULT]] : vector<3x[4]xf32>
// CHECK:         }

// -----

func.func @transfer_write_array_of_scalable(%vec: vector<3x[4]xf32>, %arg0: memref<3x?xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %cst = arith.constant 0.000000e+00 : f32
  %dim = memref.dim %arg0, %c1 : memref<3x?xf32>
  %mask = vector.create_mask %c1, %dim : vector<3x[4]xi1>
  vector.transfer_write %vec, %arg0[%c0, %c0], %mask {in_bounds = [true, true]} : vector<3x[4]xf32>, memref<3x?xf32>
  return
}
// CHECK-LABEL:   func.func @transfer_write_array_of_scalable(
// CHECK-SAME:                                                %[[VEC:.*]]: vector<3x[4]xf32>,
// CHECK-SAME:                                                %[[MEMREF:.*]]: memref<3x?xf32>) {
// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG:       %[[C3:.*]] = arith.constant 3 : index
// CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
// CHECK:           %[[ALLOCA_VEC:.*]] = memref.alloca() : memref<vector<3x[4]xf32>>
// CHECK:           %[[ALLOCA_MASK:.*]] = memref.alloca() : memref<vector<3x[4]xi1>>
// CHECK:           %[[DIM_SIZE:.*]] = memref.dim %[[MEMREF]], %[[C1]] : memref<3x?xf32>
// CHECK:           %[[MASK:.*]] = vector.create_mask %[[C1]], %[[DIM_SIZE]] : vector<3x[4]xi1>
// CHECK:           memref.store %[[MASK]], %[[ALLOCA_MASK]][] : memref<vector<3x[4]xi1>>
// CHECK:           memref.store %[[VEC]], %[[ALLOCA_VEC]][] : memref<vector<3x[4]xf32>>
// CHECK:           %[[UNPACK_VECTOR:.*]] = vector.type_cast %[[ALLOCA_VEC]] : memref<vector<3x[4]xf32>> to memref<3xvector<[4]xf32>>
// CHECK:           %[[UNPACK_MASK:.*]] = vector.type_cast %[[ALLOCA_MASK]] : memref<vector<3x[4]xi1>> to memref<3xvector<[4]xi1>>
// CHECK:           scf.for %[[VAL_11:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
// CHECK:             %[[MASK_SLICE:.*]] = memref.load %[[UNPACK_VECTOR]]{{\[}}%[[VAL_11]]] : memref<3xvector<[4]xf32>>
// CHECK:             %[[VECTOR_SLICE:.*]] = memref.load %[[UNPACK_MASK]]{{\[}}%[[VAL_11]]] : memref<3xvector<[4]xi1>>
// CHECK:             vector.transfer_write %[[MASK_SLICE]], %[[MEMREF]]{{\[}}%[[VAL_11]], %[[C0]]], %[[VECTOR_SLICE]] {in_bounds = [true]} : vector<[4]xf32>, memref<3x?xf32>
// CHECK:           }
// CHECK:           return
// CHECK:         }

// -----

/// The following two tests currently cannot be lowered via unpacking the leading dim since it is scalable.
/// It may be possible to special case this via a dynamic dim in future.

func.func @cannot_lower_transfer_write_with_leading_scalable(%vec: vector<[4]x4xf32>, %arg0: memref<?x4xf32>) {
  %c0 = arith.constant 0 : index
  %c4 = arith.constant 4 : index
  %cst = arith.constant 0.000000e+00 : f32
  %dim = memref.dim %arg0, %c0 : memref<?x4xf32>
  %mask = vector.create_mask %dim, %c4 : vector<[4]x4xi1>
  vector.transfer_write %vec, %arg0[%c0, %c0], %mask {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x4xf32>
  return
}
// CHECK-LABEL:   func.func @cannot_lower_transfer_write_with_leading_scalable(
// CHECK-SAME:                                                                 %[[VEC:.*]]: vector<[4]x4xf32>,
// CHECK-SAME:                                                                 %[[MEMREF:.*]]: memref<?x4xf32>)
// CHECK: vector.transfer_write %[[VEC]], %[[MEMREF]][%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x4xf32>

// -----

func.func @cannot_lower_transfer_read_with_leading_scalable(%arg0: memref<?x4xf32>) -> vector<[4]x4xf32> {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  %cst = arith.constant 0.000000e+00 : f32
  %dim = memref.dim %arg0, %c0 : memref<?x4xf32>
  %mask = vector.create_mask %dim, %c4 : vector<[4]x4xi1>
  %read = vector.transfer_read %arg0[%c0, %c0], %cst, %mask {in_bounds = [true, true]} : memref<?x4xf32>, vector<[4]x4xf32>
  return %read : vector<[4]x4xf32>
}
// CHECK-LABEL:   func.func @cannot_lower_transfer_read_with_leading_scalable(
// CHECK-SAME:                                                                %[[MEMREF:.*]]: memref<?x4xf32>)
// CHECK: %{{.*}} = vector.transfer_read %[[MEMREF]][%{{.*}}, %{{.*}}], %{{.*}}, %{{.*}} {in_bounds = [true, true]} : memref<?x4xf32>, vector<[4]x4xf32>

// -----

// Check that the `TransferOpConversion` generates valid indices for the LoadOp.

#map1 = affine_map<(d0, d1, d2, d3) -> (d0, 0, 0, d3)>
func.func @does_not_crash_on_unpack_one_dim(%subview:  memref<1x1x1x1xi32>, %mask: vector<1x1xi1>) -> vector<1x1x1x1xi32> {
  %c0 = arith.constant 0 : index
  %c0_i32 = arith.constant 0 : i32
  %3 = vector.transfer_read %subview[%c0, %c0, %c0, %c0], %c0_i32, %mask {permutation_map = #map1}
          : memref<1x1x1x1xi32>, vector<1x1x1x1xi32>
  return %3 : vector<1x1x1x1xi32>
}
// CHECK-LABEL: func.func @does_not_crash_on_unpack_one_dim
// CHECK: %[[ALLOCA_0:.*]] = memref.alloca() : memref<vector<1x1xi1>>
// CHECK: %[[MASK:.*]] = vector.type_cast %[[ALLOCA_0]] : memref<vector<1x1xi1>> to memref<1xvector<1xi1>>
// CHECK: memref.load %[[MASK]][%{{.*}}] : memref<1xvector<1xi1>>

// -----

// Check that the `TransferOpConversion` generates valid indices for the StoreOp.
// This test is pulled from an integration test for ArmSVE.

func.func @add_arrays_of_scalable_vectors(%a: memref<1x2x?xf32>, %b: memref<1x2x?xf32>) -> vector<1x2x[4]xf32> {
  %c0 = arith.constant 0 : index
  %c2 = arith.constant 2 : index
  %c3 = arith.constant 2 : index
  %cst = arith.constant 0.000000e+00 : f32
  %dim_a = memref.dim %a, %c2 : memref<1x2x?xf32>
  %mask_a = vector.create_mask %c2, %c3, %dim_a : vector<1x2x[4]xi1>
  %vector_a = vector.transfer_read %a[%c0, %c0, %c0], %cst, %mask_a {in_bounds = [true, true, true]} : memref<1x2x?xf32>, vector<1x2x[4]xf32>
  return %vector_a : vector<1x2x[4]xf32>
}
// CHECK-LABEL: func.func @add_arrays_of_scalable_vectors
// CHECK: scf.for
// CHECK: scf.for
// CHECK: memref.load

// -----

// FULL-UNROLL-LABEL: @cannot_fully_unroll_transfer_write_of_nd_scalable_vector
func.func @cannot_fully_unroll_transfer_write_of_nd_scalable_vector(%vec: vector<[4]x[4]xf32>, %memref: memref<?x?xf32>) {
  // FULL-UNROLL-NOT: vector.extract
  // FULL-UNROLL: vector.transfer_write {{.*}} : vector<[4]x[4]xf32>, memref<?x?xf32>
  // FULL-UNROLL-NOT: vector.extract
  %c0 = arith.constant 0 : index
  vector.transfer_write %vec, %memref[%c0, %c0] {in_bounds = [true, true]} : vector<[4]x[4]xf32>, memref<?x?xf32>
  return
}

// -----
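/// With target-rank=0 the unrolling proceeds all the way down to 0-d
/// transfers: each element is vector.extract'ed, broadcast to a vector<i32>,
/// and written with its own 0-d transfer_write.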

// TARGET-RANK-ZERO-LABEL: func @unroll_transfer_write_target_rank_zero
func.func @unroll_transfer_write_target_rank_zero(%vec : vector<2xi32>) {
  %alloc = memref.alloc() : memref<4xi32>
  %c0 = arith.constant 0 : index
  vector.transfer_write %vec, %alloc[%c0] : vector<2xi32>, memref<4xi32>
  return
}
// TARGET-RANK-ZERO: %[[ALLOC:.*]] = memref.alloc() : memref<4xi32>
// TARGET-RANK-ZERO: %[[EXTRACTED1:.*]] = vector.extract {{.*}} : i32 from vector<2xi32>
// TARGET-RANK-ZERO: %[[BROADCASTED1:.*]] = vector.broadcast %[[EXTRACTED1]] : i32 to vector<i32>
// TARGET-RANK-ZERO: vector.transfer_write %[[BROADCASTED1]], %[[ALLOC]]{{.*}} : vector<i32>, memref<4xi32>
// TARGET-RANK-ZERO: %[[EXTRACTED2:.*]] = vector.extract {{.*}} : i32 from vector<2xi32>
// TARGET-RANK-ZERO: %[[BROADCASTED2:.*]] = vector.broadcast %[[EXTRACTED2]] : i32 to vector<i32>
// TARGET-RANK-ZERO: vector.transfer_write %[[BROADCASTED2]], %[[ALLOC]]{{.*}} : vector<i32>, memref<4xi32>

// -----
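/// Lowering of a transfer_write of a transposed scalable vector
/// (vector<4x[4]xf32> transposed to vector<[4]x4xf32>): the four fixed-size
/// rows are extracted up front, and an scf.for over vscale * 4 rebuilds each
/// transposed row with vector.from_elements before writing it out.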

func.func @scalable_transpose_store_unmasked(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  vector.transfer_write %transpose, %dest[%i, %j] {in_bounds = [true, true]} : vector<[4]x4xf32>,  memref<?x?xf32>
  return
}
// FULL-UNROLL: #[[$SLICE_MAP:.+]] = affine_map<(d0)[s0] -> (d0 + s0)>
// FULL-UNROLL-LABEL:   func.func @scalable_transpose_store_unmasked(
// FULL-UNROLL-SAME:                                                 %[[VEC:.*]]: vector<4x[4]xf32>,
// FULL-UNROLL-SAME:                                                 %[[DEST:.*]]: memref<?x?xf32>,
// FULL-UNROLL-SAME:                                                 %[[I:.*]]: index,
// FULL-UNROLL-SAME:                                                 %[[J:.*]]: index)
// FULL-UNROLL-DAG:       %[[C0:.*]] = arith.constant 0 : index
// FULL-UNROLL-DAG:       %[[C1:.*]] = arith.constant 1 : index
// FULL-UNROLL-DAG:       %[[C4:.*]] = arith.constant 4 : index
// FULL-UNROLL:           %[[SLICE_0:.*]] = vector.extract %[[VEC]][0] : vector<[4]xf32> from vector<4x[4]xf32>
// FULL-UNROLL:           %[[SLICE_1:.*]] = vector.extract %[[VEC]][1] : vector<[4]xf32> from vector<4x[4]xf32>
// FULL-UNROLL:           %[[SLICE_2:.*]] = vector.extract %[[VEC]][2] : vector<[4]xf32> from vector<4x[4]xf32>
// FULL-UNROLL:           %[[SLICE_3:.*]] = vector.extract %[[VEC]][3] : vector<[4]xf32> from vector<4x[4]xf32>
// FULL-UNROLL:           %[[VSCALE:.*]] = vector.vscale
// FULL-UNROLL:           %[[C4_VSCALE:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
// FULL-UNROLL:           scf.for %[[VAL_13:.*]] = %[[C0]] to %[[C4_VSCALE]] step %[[C1]] {
// FULL-UNROLL:             %[[SLICE_I:.*]] = affine.apply #[[$SLICE_MAP]](%[[VAL_13]]){{\[}}%[[I]]]
// FULL-UNROLL:             %[[ELEM_0:.*]] = vector.extract %[[SLICE_0]]{{\[}}%[[VAL_13]]] : f32 from vector<[4]xf32>
// FULL-UNROLL:             %[[ELEM_1:.*]] = vector.extract %[[SLICE_1]]{{\[}}%[[VAL_13]]] : f32 from vector<[4]xf32>
// FULL-UNROLL:             %[[ELEM_2:.*]] = vector.extract %[[SLICE_2]]{{\[}}%[[VAL_13]]] : f32 from vector<[4]xf32>
// FULL-UNROLL:             %[[ELEM_3:.*]] = vector.extract %[[SLICE_3]]{{\[}}%[[VAL_13]]] : f32 from vector<[4]xf32>
// FULL-UNROLL:             %[[TRANSPOSE_SLICE:.*]] = vector.from_elements %[[ELEM_0]], %[[ELEM_1]], %[[ELEM_2]], %[[ELEM_3]] : vector<4xf32>
// FULL-UNROLL:             vector.transfer_write %[[TRANSPOSE_SLICE]], %[[DEST]]{{\[}}%[[SLICE_I]], %[[J]]] {in_bounds = [true]} : vector<4xf32>, memref<?x?xf32>

// -----

func.func @scalable_transpose_store_dynamic_mask(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index, %a: index, %b: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  %mask = vector.create_mask %a, %b : vector<[4]x4xi1>
  vector.transfer_write %transpose, %dest[%i, %j], %mask {in_bounds = [true, true]} : vector<[4]x4xf32>,  memref<?x?xf32>
  return
}
// FULL-UNROLL-LABEL:   func.func @scalable_transpose_store_dynamic_mask(
// FULL-UNROLL-SAME:                                                     %{{.*}}, %[[A:.*]]: index, %[[B:.*]]: index)
// FULL-UNROLL:           %[[SLICE_MASK:.*]] = vector.create_mask %[[B]] : vector<4xi1>
// FULL-UNROLL:           scf.for %{{.*}} to %[[A]]
// FULL-UNROLL:             vector.transfer_write {{.*}}, %[[SLICE_MASK]]

// -----

func.func @scalable_transpose_store_constant_mask(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  %mask = vector.constant_mask [4, 3] : vector<[4]x4xi1>
  vector.transfer_write %transpose, %dest[%i, %j], %mask {in_bounds = [true, true]} : vector<[4]x4xf32>,  memref<?x?xf32>
  return
}
// FULL-UNROLL-LABEL:   func.func @scalable_transpose_store_constant_mask
// FULL-UNROLL:           %[[C3:.*]] = arith.constant 3 : index
// FULL-UNROLL:           %[[C4:.*]] = arith.constant 4 : index
// FULL-UNROLL:           %[[VSCALE:.*]] = vector.vscale
// FULL-UNROLL:           %[[C4_VSCALE:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
// FULL-UNROLL:           %[[SLICE_MASK:.*]] = vector.create_mask %[[C3]] : vector<4xi1>
// FULL-UNROLL:           scf.for %{{.*}} to %[[C4_VSCALE]]
// FULL-UNROLL:             vector.transfer_write {{.*}}, %[[SLICE_MASK]]

// -----

/// Unsupported transpose.
func.func @negative_scalable_transpose_store_0(%vec: vector<[4]x4xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<[4]x4xf32> to vector<4x[4]xf32>
  vector.transfer_write %transpose, %dest[%i, %j] {in_bounds = [true, true]} : vector<4x[4]xf32>,  memref<?x?xf32>
  return
}
// FULL-UNROLL-LABEL: @negative_scalable_transpose_store_0
// FULL-UNROLL-NOT:   scf.for

// -----

/// Non-identity permutation map (should be lowered first).
func.func @negative_scalable_transpose_store_1(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  vector.transfer_write %transpose, %dest[%i, %j] {in_bounds = [true, true], permutation_map = affine_map<(d0,d1) -> (d1, d0)> } : vector<[4]x4xf32>,  memref<?x?xf32>
  return
}
// FULL-UNROLL-LABEL: @negative_scalable_transpose_store_1
// FULL-UNROLL-NOT:   scf.for


// -----

/// Out-of-bounds dim.
func.func @negative_scalable_transpose_store_2(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  vector.transfer_write %transpose, %dest[%i, %j] {in_bounds = [false, true]} : vector<[4]x4xf32>,  memref<?x?xf32>
  return
}
// FULL-UNROLL-LABEL: @negative_scalable_transpose_store_2
// FULL-UNROLL-NOT:   scf.for

// -----

/// Source not a vector.transpose.
func.func @negative_scalable_transpose_store_3(%vec: vector<[4]x4xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  vector.transfer_write %vec, %dest[%i, %j] {in_bounds = [true, true]} : vector<[4]x4xf32>,  memref<?x?xf32>
  return
}
// FULL-UNROLL-LABEL: @negative_scalable_transpose_store_3
// FULL-UNROLL-NOT:   scf.for