// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(convert-vector-to-scf))" -split-input-file -allow-unregistered-dialect | FileCheck %s
// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(convert-vector-to-scf{full-unroll=true lower-scalable=true}))" -split-input-file -allow-unregistered-dialect | FileCheck %s --check-prefix=FULL-UNROLL
// RUN: mlir-opt %s "-convert-vector-to-scf=full-unroll target-rank=0" -split-input-file -allow-unregistered-dialect | FileCheck %s --check-prefix=TARGET-RANK-ZERO

// CHECK-LABEL: func @vector_transfer_ops_0d(
func.func @vector_transfer_ops_0d(%M: memref<f32>) {
  %f0 = arith.constant 0.0 : f32

  // 0-d transfers are left untouched by vector-to-scf.
  // They are independently lowered to the proper memref.load/store.
  // CHECK: vector.transfer_read {{.*}}: memref<f32>, vector<f32>
  %0 = vector.transfer_read %M[], %f0 {permutation_map = affine_map<()->()>} :
    memref<f32>, vector<f32>

  // CHECK: vector.transfer_write {{.*}}: vector<f32>, memref<f32>
  vector.transfer_write %0, %M[] {permutation_map = affine_map<()->()>} :
    vector<f32>, memref<f32>

  return
}

// -----

// CHECK-LABEL: func @materialize_read_1d() {
func.func @materialize_read_1d() {
  %f0 = arith.constant 0.0: f32
  %A = memref.alloc () : memref<7x42xf32>
  affine.for %i0 = 0 to 7 step 4 {
    affine.for %i1 = 0 to 42 step 4 {
      %f1 = vector.transfer_read %A[%i0, %i1], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
      %ip1 = affine.apply affine_map<(d0) -> (d0 + 1)> (%i1)
      %f2 = vector.transfer_read %A[%i0, %ip1], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
      %ip2 = affine.apply affine_map<(d0) -> (d0 + 2)> (%i1)
      %f3 = vector.transfer_read %A[%i0, %ip2], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
      %ip3 = affine.apply affine_map<(d0) -> (d0 + 3)> (%i1)
      %f4 = vector.transfer_read %A[%i0, %ip3], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
      // Both accesses in the load must be clipped otherwise %i1 + 2 and %i1 + 3 will go out of bounds.
      // CHECK: scf.if
      // CHECK-NEXT: memref.load
      // CHECK-NEXT: vector.insertelement
      // CHECK-NEXT: scf.yield
      // CHECK-NEXT: else
      // CHECK-NEXT: scf.yield
      // Add a dummy use to prevent dead code elimination from removing transfer
      // read ops.
      "dummy_use"(%f1, %f2, %f3, %f4) : (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) -> ()
    }
  }
  return
}

// -----

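/// Note: the next test reads along a static dimension of a partially dynamic
/// memref; the CHECK-NOT lines in it verify that no memref.dim queries are
/// generated for the statically known dimensions.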
// CHECK-LABEL: func @materialize_read_1d_partially_specialized
func.func @materialize_read_1d_partially_specialized(%dyn1 : index, %dyn2 : index, %dyn4 : index) {
  %f0 = arith.constant 0.0: f32
  %A = memref.alloc (%dyn1, %dyn2, %dyn4) : memref<7x?x?x42x?xf32>
  affine.for %i0 = 0 to 7 {
    affine.for %i1 = 0 to %dyn1 {
      affine.for %i2 = 0 to %dyn2 {
        affine.for %i3 = 0 to 42 step 2 {
          affine.for %i4 = 0 to %dyn4 {
            %f1 = vector.transfer_read %A[%i0, %i1, %i2, %i3, %i4], %f0 {permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3)>} : memref<7x?x?x42x?xf32>, vector<4xf32>
            %i3p1 = affine.apply affine_map<(d0) -> (d0 + 1)> (%i3)
            %f2 = vector.transfer_read %A[%i0, %i1, %i2, %i3p1, %i4], %f0 {permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3)>} : memref<7x?x?x42x?xf32>, vector<4xf32>
            // Add a dummy use to prevent dead code elimination from removing
            // transfer read ops.
            "dummy_use"(%f1, %f2) : (vector<4xf32>, vector<4xf32>) -> ()
          }
        }
      }
    }
  }
  // CHECK: %[[tensor:[0-9a-zA-Z_]+]] = memref.alloc
  // CHECK-NOT: {{.*}} memref.dim %[[tensor]], %c0
  // CHECK-NOT: {{.*}} memref.dim %[[tensor]], %c3
  return
}

// -----

// CHECK: #[[$ADD:map.*]] = affine_map<(d0, d1) -> (d0 + d1)>

// CHECK-LABEL: func @materialize_read(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) {
func.func @materialize_read(%M: index, %N: index, %O: index, %P: index) {
  %f0 = arith.constant 0.0: f32
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
  // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
  // CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index
  // CHECK: %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref<?x?x?x?xf32>
  // CHECK-NEXT: affine.for %[[I0:.*]] = 0 to %{{.*}} step 3 {
  // CHECK-NEXT: affine.for %[[I1:.*]] = 0 to %{{.*}} {
  // CHECK-NEXT: affine.for %[[I2:.*]] = 0 to %{{.*}} {
  // CHECK-NEXT: affine.for %[[I3:.*]] = 0 to %{{.*}} step 5 {
  // CHECK: %[[ALLOC:.*]] = memref.alloca() : memref<vector<5x4x3xf32>>
  // CHECK: scf.for %[[I4:.*]] = %[[C0]] to %[[C5]] step %[[C1]] {
  // CHECK: scf.if
  // CHECK: %[[L3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I4]])
  // CHECK: scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
  // CHECK: %[[VEC:.*]] = scf.for %[[I6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<3xf32>) {
  // CHECK: %[[L0:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I6]])
  // CHECK: scf.if {{.*}} -> (vector<3xf32>) {
  // CHECK-NEXT: %[[SCAL:.*]] = memref.load %{{.*}}[%[[L0]], %[[I1]], %[[I2]], %[[L3]]] : memref<?x?x?x?xf32>
  // CHECK-NEXT: %[[RVEC:.*]] = vector.insertelement %[[SCAL]], %{{.*}}[%[[I6]] : index] : vector<3xf32>
  // CHECK-NEXT: scf.yield
  // CHECK-NEXT: } else {
  // CHECK-NEXT: scf.yield
  // CHECK-NEXT: }
  // CHECK-NEXT: scf.yield
  // CHECK-NEXT: }
  // CHECK-NEXT: memref.store %[[VEC]], {{.*}} : memref<5x4xvector<3xf32>>
  // CHECK-NEXT: }
  // CHECK-NEXT: } else {
  // CHECK-NEXT: memref.store {{.*}} : memref<5xvector<4x3xf32>>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: %[[LD:.*]] = memref.load %[[ALLOC]][] : memref<vector<5x4x3xf32>>
  // CHECK-NEXT: "dummy_use"(%[[LD]]) : (vector<5x4x3xf32>) -> ()
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  // CHECK-NEXT:}

  // Check that I0 + I4 (of size 3) read from first index load(L0, ...) and write into last index store(..., I4)
  // Check that I3 + I6 (of size 5) read from last index load(..., L3) and write into first index store(I6, ...)
  // Other dimensions are just accessed with I1, I2 resp.
  %A = memref.alloc (%M, %N, %O, %P) : memref<?x?x?x?xf32, 0>
  affine.for %i0 = 0 to %M step 3 {
    affine.for %i1 = 0 to %N {
      affine.for %i2 = 0 to %O {
        affine.for %i3 = 0 to %P step 5 {
          %f = vector.transfer_read %A[%i0, %i1, %i2, %i3], %f0 {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, 0, d0)>} : memref<?x?x?x?xf32>, vector<5x4x3xf32>
          // Add a dummy use to prevent dead code elimination from removing
          // transfer read ops.
          "dummy_use"(%f) : (vector<5x4x3xf32>) -> ()
        }
      }
    }
  }
  return
}

// -----

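/// Note: @materialize_write mirrors @materialize_read: the n-D transfer_write
/// is staged through an alloca of the full vector, progressively narrowed via
/// vector.type_cast views, and written back with 1-D transfers guarded by
/// scf.if bounds checks (as the CHECK lines below show).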
// CHECK: #[[$ADD:map.*]] = affine_map<(d0, d1) -> (d0 + d1)>

// CHECK-LABEL:func @materialize_write(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) {
func.func @materialize_write(%M: index, %N: index, %O: index, %P: index) {
  // CHECK-DAG: %{{.*}} = arith.constant dense<1.000000e+00> : vector<3x4x1x5xf32>
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
  // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
  // CHECK: %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref<?x?x?x?xf32>
  // CHECK-NEXT: affine.for %[[I0:.*]] = 0 to %{{.*}} step 3 {
  // CHECK-NEXT: affine.for %[[I1:.*]] = 0 to %{{.*}} step 4 {
  // CHECK-NEXT: affine.for %[[I2:.*]] = 0 to %{{.*}} {
  // CHECK-NEXT: affine.for %[[I3:.*]] = 0 to %{{.*}} step 5 {
  // CHECK: %[[ALLOC:.*]] = memref.alloca() : memref<vector<3x4x1x5xf32>>
  // CHECK: memref.store %{{.*}}, %[[ALLOC]][] : memref<vector<3x4x1x5xf32>>
  // CHECK: %[[VECTOR_VIEW1:.*]] = vector.type_cast %[[ALLOC]] : memref<vector<3x4x1x5xf32>> to memref<3xvector<4x1x5xf32>>
  // CHECK: scf.for %[[I4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
  // CHECK: scf.if
  // CHECK: %[[S3:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I4]])
  // CHECK: %[[VECTOR_VIEW2:.*]] = vector.type_cast %[[VECTOR_VIEW1]] : memref<3xvector<4x1x5xf32>> to memref<3x4xvector<1x5xf32>>
  // CHECK: scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
  // CHECK: scf.if
  // CHECK: %[[S1:.*]] = affine.apply #[[$ADD]](%[[I1]], %[[I5]])
  // CHECK: %[[VECTOR_VIEW3:.*]] = vector.type_cast %[[VECTOR_VIEW2]] : memref<3x4xvector<1x5xf32>> to memref<3x4x1xvector<5xf32>>
  // CHECK: scf.for %[[I6:.*]] = %[[C0]] to %[[C1]] step %[[C1]] {
  // CHECK: %[[S0:.*]] = affine.apply #[[$ADD]](%[[I2]], %[[I6]])
  // CHECK: %[[VEC:.*]] = memref.load %[[VECTOR_VIEW3]][%[[I4]], %[[I5]], %[[I6]]] : memref<3x4x1xvector<5xf32>>
  // CHECK: vector.transfer_write %[[VEC]], %{{.*}}[%[[S3]], %[[S1]], %[[S0]], %[[I3]]] : vector<5xf32>, memref<?x?x?x?xf32>
  // CHECK: }
  // CHECK: }
  // CHECK: }
  // CHECK: }
  // CHECK: }
  // CHECK: }
  // CHECK: }
  // CHECK: }
  // CHECK: }
  // CHECK: return

  // Check that I0 + I4 (of size 3) read from last index load(..., I4) and write into first index store(S0, ...)
  // Check that I1 + I5 (of size 4) read from second index load(..., I5, ...) and write into second index store(..., S1, ...)
  // Check that I3 + I6 (of size 5) read from first index load(I6, ...) and write into last index store(..., S3)
  // Other dimension is just accessed with I2.
  %A = memref.alloc (%M, %N, %O, %P) : memref<?x?x?x?xf32, 0>
  %f1 = arith.constant dense<1.000000e+00> : vector<5x4x3xf32>
  affine.for %i0 = 0 to %M step 3 {
    affine.for %i1 = 0 to %N step 4 {
      affine.for %i2 = 0 to %O {
        affine.for %i3 = 0 to %P step 5 {
          vector.transfer_write %f1, %A[%i0, %i1, %i2, %i3] {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d1, d0)>} : vector<5x4x3xf32>, memref<?x?x?x?xf32>
        }
      }
    }
  }
  return
}

// -----

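/// Note: the "progressive" tests below exercise both lowering strategies. By
/// default the leading dimension is peeled into an scf.for loop over a staging
/// alloca with scf.if bounds guards; with full-unroll=true that loop is fully
/// unrolled into per-row 1-D transfers combined with vector.insert /
/// vector.extract.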
// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>

// FULL-UNROLL-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 + 1)>
// FULL-UNROLL-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 2)>


// CHECK-LABEL: transfer_read_progressive(
// CHECK-SAME: %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
// CHECK-SAME: %[[base:[a-zA-Z0-9]+]]: index

// FULL-UNROLL-LABEL: transfer_read_progressive(
// FULL-UNROLL-SAME: %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
// FULL-UNROLL-SAME: %[[base:[a-zA-Z0-9]+]]: index

func.func @transfer_read_progressive(%A : memref<?x?xf32>, %base: index) -> vector<3x15xf32> {
  %f7 = arith.constant 7.0: f32
  // CHECK-DAG: %[[C7:.*]] = arith.constant 7.000000e+00 : f32
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
  // CHECK-DAG: %[[splat:.*]] = arith.constant dense<7.000000e+00> : vector<15xf32>
  // CHECK-DAG: %[[alloc:.*]] = memref.alloca() : memref<vector<3x15xf32>>
  // CHECK: %[[alloc_casted:.*]] = vector.type_cast %[[alloc]] : memref<vector<3x15xf32>> to memref<3xvector<15xf32>>
  // CHECK: scf.for %[[I:.*]] = %[[C0]] to %[[C3]]
  // CHECK: %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
  // CHECK: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]]
  // CHECK: %[[cond1:.*]] = arith.cmpi sgt, %[[dim]], %[[add]] : index
  // CHECK: scf.if %[[cond1]] {
  // CHECK: %[[vec_1d:.*]] = vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
  // CHECK: memref.store %[[vec_1d]], %[[alloc_casted]][%[[I]]] : memref<3xvector<15xf32>>
  // CHECK: } else {
  // CHECK: store %[[splat]], %[[alloc_casted]][%[[I]]] : memref<3xvector<15xf32>>
  // CHECK: }
  // CHECK: }
  // CHECK: %[[cst:.*]] = memref.load %[[alloc]][] : memref<vector<3x15xf32>>

  // FULL-UNROLL-DAG: %[[C7:.*]] = arith.constant 7.000000e+00 : f32
  // FULL-UNROLL-DAG: %[[VEC0:.*]] = arith.constant dense<7.000000e+00> : vector<3x15xf32>
  // FULL-UNROLL-DAG: %[[C0:.*]] = arith.constant 0 : index
  // FULL-UNROLL: %[[DIM:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
  // FULL-UNROLL: cmpi sgt, %[[DIM]], %[[base]] : index
  // FULL-UNROLL: %[[VEC1:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) {
  // FULL-UNROLL: vector.transfer_read %[[A]][%[[base]], %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
  // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC0]] [0] : vector<15xf32> into vector<3x15xf32>
  // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: } else {
  // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: }
  // FULL-UNROLL: affine.apply #[[$MAP1]]()[%[[base]]]
  // FULL-UNROLL: cmpi sgt, %{{.*}}, %{{.*}} : index
  // FULL-UNROLL: %[[VEC2:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) {
  // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
  // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC1]] [1] : vector<15xf32> into vector<3x15xf32>
  // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: } else {
  // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: }
  // FULL-UNROLL: affine.apply #[[$MAP2]]()[%[[base]]]
  // FULL-UNROLL: cmpi sgt, %{{.*}}, %{{.*}} : index
  // FULL-UNROLL: %[[VEC3:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) {
  // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
  // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC2]] [2] : vector<15xf32> into vector<3x15xf32>
  // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: } else {
  // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: }

  %f = vector.transfer_read %A[%base, %base], %f7 :
    memref<?x?xf32>, vector<3x15xf32>

  return %f: vector<3x15xf32>
}

// -----

// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>

// FULL-UNROLL-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 + 1)>
// FULL-UNROLL-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 2)>

// CHECK-LABEL: transfer_write_progressive(
// CHECK-SAME: %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
// CHECK-SAME: %[[base:[a-zA-Z0-9]+]]: index,
// CHECK-SAME: %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
// FULL-UNROLL-LABEL: transfer_write_progressive(
// FULL-UNROLL-SAME: %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
// FULL-UNROLL-SAME: %[[base:[a-zA-Z0-9]+]]: index,
// FULL-UNROLL-SAME: %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
func.func @transfer_write_progressive(%A : memref<?x?xf32>, %base: index, %vec: vector<3x15xf32>) {
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
  // CHECK: %[[alloc:.*]] = memref.alloca() : memref<vector<3x15xf32>>
  // CHECK: memref.store %[[vec]], %[[alloc]][] : memref<vector<3x15xf32>>
  // CHECK: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<vector<3x15xf32>> to memref<3xvector<15xf32>>
  // CHECK: scf.for %[[I:.*]] = %[[C0]] to %[[C3]]
  // CHECK: %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
  // CHECK: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]]
  // CHECK: %[[cmp:.*]] = arith.cmpi sgt, %[[dim]], %[[add]] : index
  // CHECK: scf.if %[[cmp]] {
  // CHECK: %[[vec_1d:.*]] = memref.load %[[vmemref]][%[[I]]] : memref<3xvector<15xf32>>
  // CHECK: vector.transfer_write %[[vec_1d]], %[[A]][{{.*}}, %[[base]]] : vector<15xf32>, memref<?x?xf32>
  // CHECK: }
  // CHECK: }

  // FULL-UNROLL: %[[C0:.*]] = arith.constant 0 : index
  // FULL-UNROLL: %[[DIM:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
  // FULL-UNROLL: %[[CMP0:.*]] = arith.cmpi sgt, %[[DIM]], %[[base]] : index
  // FULL-UNROLL: scf.if %[[CMP0]] {
  // FULL-UNROLL: %[[V0:.*]] = vector.extract %[[vec]][0] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[V0]], %[[A]][%[[base]], %[[base]]] : vector<15xf32>, memref<?x?xf32>
  // FULL-UNROLL: }
  // FULL-UNROLL: %[[I1:.*]] = affine.apply #[[$MAP1]]()[%[[base]]]
  // FULL-UNROLL: %[[CMP1:.*]] = arith.cmpi sgt, %{{.*}}, %[[I1]] : index
  // FULL-UNROLL: scf.if %[[CMP1]] {
  // FULL-UNROLL: %[[V1:.*]] = vector.extract %[[vec]][1] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[V1]], %[[A]][%{{.*}}, %[[base]]] : vector<15xf32>, memref<?x?xf32>
  // FULL-UNROLL: }
  // FULL-UNROLL: %[[I2:.*]] = affine.apply #[[$MAP2]]()[%[[base]]]
  // FULL-UNROLL: %[[CMP2:.*]] = arith.cmpi sgt, %{{.*}}, %[[I2]] : index
  // FULL-UNROLL: scf.if %[[CMP2]] {
  // FULL-UNROLL: %[[V2:.*]] = vector.extract %[[vec]][2] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[V2]], %[[A]][%{{.*}}, %[[base]]] : vector<15xf32>, memref<?x?xf32>
  // FULL-UNROLL: }

  vector.transfer_write %vec, %A[%base, %base] :
    vector<3x15xf32>, memref<?x?xf32>
  return
}

// -----

// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>

// FULL-UNROLL-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 + 1)>
// FULL-UNROLL-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 2)>

// CHECK-LABEL: transfer_write_progressive_inbounds(
// CHECK-SAME: %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
// CHECK-SAME: %[[base:[a-zA-Z0-9]+]]: index,
// CHECK-SAME: %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
// FULL-UNROLL-LABEL: transfer_write_progressive_inbounds(
// FULL-UNROLL-SAME: %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
// FULL-UNROLL-SAME: %[[base:[a-zA-Z0-9]+]]: index,
// FULL-UNROLL-SAME: %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
func.func @transfer_write_progressive_inbounds(%A : memref<?x?xf32>, %base: index, %vec: vector<3x15xf32>) {
  // CHECK-NOT: scf.if
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
  // CHECK: %[[alloc:.*]] = memref.alloca() : memref<vector<3x15xf32>>
  // CHECK-NEXT: memref.store %[[vec]], %[[alloc]][] : memref<vector<3x15xf32>>
  // CHECK-NEXT: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<vector<3x15xf32>> to memref<3xvector<15xf32>>
  // CHECK-NEXT: scf.for %[[I:.*]] = %[[C0]] to %[[C3]]
  // CHECK-NEXT: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]]
  // CHECK-NEXT: %[[vec_1d:.*]] = memref.load %[[vmemref]][%[[I]]] : memref<3xvector<15xf32>>
  // CHECK-NEXT: vector.transfer_write %[[vec_1d]], %[[A]][%[[add]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>

  // FULL-UNROLL: %[[VEC0:.*]] = vector.extract %[[vec]][0] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[VEC0]], %[[A]][%[[base]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>
  // FULL-UNROLL: %[[I1:.*]] = affine.apply #[[$MAP1]]()[%[[base]]]
  // FULL-UNROLL: %[[VEC1:.*]] = vector.extract %[[vec]][1] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[VEC1]], %[[A]][%[[I1]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>
  // FULL-UNROLL: %[[I2:.*]] = affine.apply #[[$MAP2]]()[%[[base]]]
  // FULL-UNROLL: %[[VEC2:.*]] = vector.extract %[[vec]][2] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[VEC2]], %[[A]][%[[I2]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>
  vector.transfer_write %vec, %A[%base, %base] {in_bounds = [true, true]} :
    vector<3x15xf32>, memref<?x?xf32>
  return
}

// -----

// FULL-UNROLL-LABEL: transfer_read_simple
func.func @transfer_read_simple(%A : memref<2x2xf32>) -> vector<2x2xf32> {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32
  // FULL-UNROLL-DAG: %[[VC0:.*]] = arith.constant dense<0.000000e+00> : vector<2x2xf32>
  // FULL-UNROLL-DAG: %[[C0:.*]] = arith.constant 0 : index
  // FULL-UNROLL-DAG: %[[C1:.*]] = arith.constant 1 : index
  // FULL-UNROLL: %[[V0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]]
  // FULL-UNROLL: %[[RES0:.*]] = vector.insert %[[V0]], %[[VC0]] [0] : vector<2xf32> into vector<2x2xf32>
  // FULL-UNROLL: %[[V1:.*]] = vector.transfer_read %{{.*}}[%[[C1]], %[[C0]]]
  // FULL-UNROLL: %[[RES1:.*]] = vector.insert %[[V1]], %[[RES0]] [1] : vector<2xf32> into vector<2x2xf32>
  %0 = vector.transfer_read %A[%c0, %c0], %f0 : memref<2x2xf32>, vector<2x2xf32>
  return %0 : vector<2x2xf32>
}

func.func @transfer_read_minor_identity(%A : memref<?x?x?x?xf32>) -> vector<3x3xf32> {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32
  %0 = vector.transfer_read %A[%c0, %c0, %c0, %c0], %f0
    { permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)> }
      : memref<?x?x?x?xf32>, vector<3x3xf32>
  return %0 : vector<3x3xf32>
}

// CHECK-LABEL: transfer_read_minor_identity(
// CHECK-SAME: %[[A:.*]]: memref<?x?x?x?xf32>) -> vector<3x3xf32>
// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[c2:.*]] = arith.constant 2 : index
// CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index
// CHECK-DAG: %[[f0:.*]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG: %[[cst0:.*]] = arith.constant dense<0.000000e+00> : vector<3xf32>
// CHECK: %[[m:.*]] = memref.alloca() : memref<vector<3x3xf32>>
// CHECK: %[[cast:.*]] = vector.type_cast %[[m]] : memref<vector<3x3xf32>> to memref<3xvector<3xf32>>
// CHECK: scf.for %[[arg1:.*]] = %[[c0]] to %[[c3]]
// CHECK: %[[d:.*]] = memref.dim %[[A]], %[[c2]] : memref<?x?x?x?xf32>
// CHECK: %[[cmp:.*]] = arith.cmpi sgt, %[[d]], %[[arg1]] : index
// CHECK: scf.if %[[cmp]] {
// CHECK: %[[tr:.*]] = vector.transfer_read %[[A]][%c0, %c0, %[[arg1]], %c0], %[[f0]] : memref<?x?x?x?xf32>, vector<3xf32>
// CHECK: memref.store %[[tr]], %[[cast]][%[[arg1]]] : memref<3xvector<3xf32>>
// CHECK: } else {
// CHECK: memref.store %[[cst0]], %[[cast]][%[[arg1]]] : memref<3xvector<3xf32>>
// CHECK: }
// CHECK: }
// CHECK: %[[ret:.*]] = memref.load %[[m]][] : memref<vector<3x3xf32>>
// CHECK: return %[[ret]] : vector<3x3xf32>

func.func @transfer_write_minor_identity(%A : vector<3x3xf32>, %B : memref<?x?x?x?xf32>) {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32
  vector.transfer_write %A, %B[%c0, %c0, %c0, %c0]
    { permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)> }
      : vector<3x3xf32>, memref<?x?x?x?xf32>
  return
}

// CHECK-LABEL: transfer_write_minor_identity(
// CHECK-SAME: %[[A:.*]]: vector<3x3xf32>,
// CHECK-SAME: %[[B:.*]]: memref<?x?x?x?xf32>)
// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[c2:.*]] = arith.constant 2 : index
// CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index
// CHECK: %[[m:.*]] = memref.alloca() : memref<vector<3x3xf32>>
// CHECK: memref.store %[[A]], %[[m]][] : memref<vector<3x3xf32>>
// CHECK: %[[cast:.*]] = vector.type_cast %[[m]] : memref<vector<3x3xf32>> to memref<3xvector<3xf32>>
// CHECK: scf.for %[[arg2:.*]] = %[[c0]] to %[[c3]]
// CHECK: %[[d:.*]] = memref.dim %[[B]], %[[c2]] : memref<?x?x?x?xf32>
// CHECK: %[[cmp:.*]] = arith.cmpi sgt, %[[d]], %[[arg2]] : index
// CHECK: scf.if %[[cmp]] {
// CHECK: %[[tmp:.*]] = memref.load %[[cast]][%[[arg2]]] : memref<3xvector<3xf32>>
// CHECK: vector.transfer_write %[[tmp]], %[[B]][%[[c0]], %[[c0]], %[[arg2]], %[[c0]]] : vector<3xf32>, memref<?x?x?x?xf32>
// CHECK: }
// CHECK: }
// CHECK: return


// -----

func.func @transfer_read_strided(%A : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>) -> vector<4xf32> {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32
  %0 = vector.transfer_read %A[%c0, %c0], %f0
    : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>, vector<4xf32>
  return %0 : vector<4xf32>
}

// CHECK-LABEL: transfer_read_strided(
// CHECK: scf.for
// CHECK: memref.load

func.func @transfer_write_strided(%A : vector<4xf32>, %B : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>) {
  %c0 = arith.constant 0 : index
  vector.transfer_write %A, %B[%c0, %c0] :
    vector<4xf32>, memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>
  return
}

// CHECK-LABEL: transfer_write_strided(
// CHECK: scf.for
// CHECK: store

// -----

func.func private @fake_side_effecting_fun(%0: vector<2x2xf32>) -> ()

// CHECK-LABEL: transfer_read_within_async_execute
func.func @transfer_read_within_async_execute(%A : memref<2x2xf32>) -> !async.token {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32
  // CHECK-NOT: alloca
  // CHECK: async.execute
  // CHECK: alloca
  %token = async.execute {
    %0 = vector.transfer_read %A[%c0, %c0], %f0 : memref<2x2xf32>, vector<2x2xf32>
    func.call @fake_side_effecting_fun(%0) : (vector<2x2xf32>) -> ()
    async.yield
  }
  return %token : !async.token
}

// -----

// CHECK-LABEL: transfer_read_with_tensor
func.func @transfer_read_with_tensor(%arg: tensor<f32>) -> vector<1xf32> {
  // CHECK: %[[EXTRACTED:.*]] = vector.transfer_read %{{.*}}[], %{{.*}} : tensor<f32>, vector<f32>
  // CHECK-NEXT: %[[RESULT:.*]] = vector.broadcast %[[EXTRACTED]] : vector<f32> to vector<1xf32>
  // CHECK-NEXT: return %[[RESULT]] : vector<1xf32>
  %f0 = arith.constant 0.0 : f32
  %0 = vector.transfer_read %arg[], %f0 {permutation_map = affine_map<()->(0)>} :
    tensor<f32>, vector<1xf32>
  return %0: vector<1xf32>
}

// -----

// CHECK-LABEL: transfer_write_scalable
func.func @transfer_write_scalable(%arg0: memref<?xf32, strided<[?], offset: ?>>, %arg1: f32) {
  %0 = llvm.mlir.constant(0 : i32) : i32
  %c0 = arith.constant 0 : index
  %dim = memref.dim %arg0, %c0 : memref<?xf32, strided<[?], offset: ?>>
  %1 = llvm.intr.stepvector : vector<[16]xi32>
  %2 = arith.index_cast %dim : index to i32
  %3 = llvm.mlir.undef : vector<[16]xi32>
  %4 = llvm.insertelement %2, %3[%0 : i32] : vector<[16]xi32>
  %5 = llvm.shufflevector %4, %3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<[16]xi32>
  %6 = arith.cmpi slt, %1, %5 : vector<[16]xi32>
  %7 = llvm.mlir.undef : vector<[16]xf32>
  %8 = llvm.insertelement %arg1, %7[%0 : i32] : vector<[16]xf32>
  %9 = llvm.shufflevector %8, %7 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<[16]xf32>
  vector.transfer_write %9, %arg0[%c0], %6 {in_bounds = [true]} : vector<[16]xf32>, memref<?xf32, strided<[?], offset: ?>>
  return
}

// CHECK-SAME: %[[ARG_0:.*]]: memref<?xf32, strided<[?], offset: ?>>,
// CHECK-DAG: %[[C_0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C_16:.*]] = arith.constant 16 : index
// CHECK-DAG: %[[STEP:.*]] = arith.constant 1 : index
// CHECK: %[[MASK_VEC:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} : vector<[16]xi32>
// CHECK: %[[VSCALE:.*]] = vector.vscale
// CHECK: %[[UB:.*]] = arith.muli %[[VSCALE]], %[[C_16]] : index
// CHECK: scf.for %[[IDX:.*]] = %[[C_0]] to %[[UB]] step %[[STEP]] {
// CHECK: %[[MASK_VAL:.*]] = vector.extractelement %[[MASK_VEC]][%[[IDX]] : index] : vector<[16]xi1>
// CHECK: scf.if %[[MASK_VAL]] {
// CHECK: %[[VAL_TO_STORE:.*]] = vector.extractelement %{{.*}}[%[[IDX]] : index] : vector<[16]xf32>
// CHECK: memref.store %[[VAL_TO_STORE]], %[[ARG_0]][%[[IDX]]] : memref<?xf32, strided<[?], offset: ?>>
// CHECK: } else {
// CHECK: }
// CHECK: }

// -----

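/// Note: in the vector.print tests below, a multi-dimensional vector is first
/// flattened with vector.shape_cast and then printed element-by-element from
/// nested scf.for loops, with punctuation ops for brackets and commas; for a
/// scalable vector the trip count is derived from vector.vscale.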
func.func @vector_print_vector_0d(%arg0: vector<f32>) {
  vector.print %arg0 : vector<f32>
  return
}
// CHECK-LABEL: func.func @vector_print_vector_0d(
// CHECK-SAME: %[[VEC:.*]]: vector<f32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[FLAT_VEC:.*]] = vector.shape_cast %[[VEC]] : vector<f32> to vector<1xf32>
// CHECK: vector.print punctuation <open>
// CHECK: scf.for %[[IDX:.*]] = %[[C0]] to %[[C1]] step %[[C1]] {
// CHECK: %[[EL:.*]] = vector.extractelement %[[FLAT_VEC]]{{\[}}%[[IDX]] : index] : vector<1xf32>
// CHECK: vector.print %[[EL]] : f32 punctuation <no_punctuation>
// CHECK: %[[IS_NOT_LAST:.*]] = arith.cmpi ult, %[[IDX]], %[[C0]] : index
// CHECK: scf.if %[[IS_NOT_LAST]] {
// CHECK: vector.print punctuation <comma>
// CHECK: }
// CHECK: }
// CHECK: vector.print punctuation <close>
// CHECK: vector.print
// CHECK: return
// CHECK: }

// -----

func.func @vector_print_vector(%arg0: vector<2x2xf32>) {
  vector.print %arg0 : vector<2x2xf32>
  return
}
// CHECK-LABEL: func.func @vector_print_vector(
// CHECK-SAME: %[[VEC:.*]]: vector<2x2xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[FLAT_VEC:.*]] = vector.shape_cast %[[VEC]] : vector<2x2xf32> to vector<4xf32>
// CHECK: vector.print punctuation <open>
// CHECK: scf.for %[[I:.*]] = %[[C0]] to %[[C2]] step %[[C1]] {
// CHECK: vector.print punctuation <open>
// CHECK: scf.for %[[J:.*]] = %[[C0]] to %[[C2]] step %[[C1]] {
// CHECK: %[[OUTER_INDEX:.*]] = arith.muli %[[I]], %[[C2]] : index
// CHECK: %[[FLAT_INDEX:.*]] = arith.addi %[[J]], %[[OUTER_INDEX]] : index
// CHECK: %[[EL:.*]] = vector.extractelement %[[FLAT_VEC]]{{\[}}%[[FLAT_INDEX]] : index] : vector<4xf32>
// CHECK: vector.print %[[EL]] : f32 punctuation <no_punctuation>
// CHECK: %[[IS_NOT_LAST_J:.*]] = arith.cmpi ult, %[[J]], %[[C1]] : index
// CHECK: scf.if %[[IS_NOT_LAST_J]] {
// CHECK: vector.print punctuation <comma>
// CHECK: }
// CHECK: }
// CHECK: vector.print punctuation <close>
// CHECK: %[[IS_NOT_LAST_I:.*]] = arith.cmpi ult, %[[I]], %[[C1]] : index
// CHECK: scf.if %[[IS_NOT_LAST_I]] {
// CHECK: vector.print punctuation <comma>
// CHECK: }
// CHECK: }
// CHECK: vector.print punctuation <close>
// CHECK: vector.print
// CHECK: return
// CHECK: }

// -----

func.func @vector_print_scalable_vector(%arg0: vector<[4]xi32>) {
  vector.print %arg0 : vector<[4]xi32>
  return
}
// CHECK-LABEL: func.func @vector_print_scalable_vector(
// CHECK-SAME: %[[VEC:.*]]: vector<[4]xi32>) {
// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: %[[C4:.*]] = arith.constant 4 : index
// CHECK: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[VSCALE:.*]] = vector.vscale
// CHECK: %[[UPPER_BOUND:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
// CHECK: %[[LAST_INDEX:.*]] = arith.subi %[[UPPER_BOUND]], %[[C1]] : index
// CHECK: vector.print punctuation <open>
// CHECK: scf.for %[[IDX:.*]] = %[[C0]] to %[[UPPER_BOUND]] step %[[C1]] {
// CHECK: %[[EL:.*]] = vector.extractelement %[[VEC]]{{\[}}%[[IDX]] : index] : vector<[4]xi32>
// CHECK: vector.print %[[EL]] : i32 punctuation <no_punctuation>
// CHECK: %[[IS_NOT_LAST:.*]] = arith.cmpi ult, %[[IDX]], %[[LAST_INDEX]] : index
// CHECK: scf.if %[[IS_NOT_LAST]] {
// CHECK: vector.print punctuation <comma>
// CHECK: }
// CHECK: }
// CHECK: vector.print punctuation <close>
// CHECK: vector.print
// CHECK: return
// CHECK: }

// -----

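/// Note: for the array-of-scalable tests below, the trailing scalable
/// dimension can be unpacked because the leading dimension is fixed: both the
/// vector and its mask are staged through allocas and processed slice-by-slice
/// inside an scf.for loop.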
func.func @transfer_read_array_of_scalable(%arg0: memref<3x?xf32>) -> vector<3x[4]xf32> {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %cst = arith.constant 0.000000e+00 : f32
  %dim = memref.dim %arg0, %c1 : memref<3x?xf32>
  %mask = vector.create_mask %c1, %dim : vector<3x[4]xi1>
  %read = vector.transfer_read %arg0[%c0, %c0], %cst, %mask {in_bounds = [true, true]} : memref<3x?xf32>, vector<3x[4]xf32>
  return %read : vector<3x[4]xf32>
}
// CHECK-LABEL: func.func @transfer_read_array_of_scalable(
// CHECK-SAME: %[[ARG:.*]]: memref<3x?xf32>) -> vector<3x[4]xf32> {
// CHECK-DAG: %[[PADDING:.*]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[ALLOCA_VEC:.*]] = memref.alloca() : memref<vector<3x[4]xf32>>
// CHECK: %[[ALLOCA_MASK:.*]] = memref.alloca() : memref<vector<3x[4]xi1>>
// CHECK: %[[DIM_SIZE:.*]] = memref.dim %[[ARG]], %[[C1]] : memref<3x?xf32>
// CHECK: %[[MASK:.*]] = vector.create_mask %[[C1]], %[[DIM_SIZE]] : vector<3x[4]xi1>
// CHECK: memref.store %[[MASK]], %[[ALLOCA_MASK]][] : memref<vector<3x[4]xi1>>
// CHECK: %[[UNPACK_VECTOR:.*]] = vector.type_cast %[[ALLOCA_VEC]] : memref<vector<3x[4]xf32>> to memref<3xvector<[4]xf32>>
// CHECK: %[[UNPACK_MASK:.*]] = vector.type_cast %[[ALLOCA_MASK]] : memref<vector<3x[4]xi1>> to memref<3xvector<[4]xi1>>
// CHECK: scf.for %[[VAL_11:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
// CHECK: %[[MASK_SLICE:.*]] = memref.load %[[UNPACK_MASK]]{{\[}}%[[VAL_11]]] : memref<3xvector<[4]xi1>>
// CHECK: %[[READ_SLICE:.*]] = vector.transfer_read %[[ARG]]{{\[}}%[[VAL_11]], %[[C0]]], %[[PADDING]], %[[MASK_SLICE]] {in_bounds = [true]} : memref<3x?xf32>, vector<[4]xf32>
// CHECK: memref.store %[[READ_SLICE]], %[[UNPACK_VECTOR]]{{\[}}%[[VAL_11]]] : memref<3xvector<[4]xf32>>
// CHECK: }
// CHECK: %[[RESULT:.*]] = memref.load %[[ALLOCA_VEC]][] : memref<vector<3x[4]xf32>>
// CHECK: return %[[RESULT]] : vector<3x[4]xf32>
// CHECK: }

// -----

func.func @transfer_write_array_of_scalable(%vec: vector<3x[4]xf32>, %arg0: memref<3x?xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %cst = arith.constant 0.000000e+00 : f32
  %dim = memref.dim %arg0, %c1 : memref<3x?xf32>
  %mask = vector.create_mask %c1, %dim : vector<3x[4]xi1>
  vector.transfer_write %vec, %arg0[%c0, %c0], %mask {in_bounds = [true, true]} : vector<3x[4]xf32>, memref<3x?xf32>
  return
}
// CHECK-LABEL: func.func @transfer_write_array_of_scalable(
// CHECK-SAME: %[[VEC:.*]]: vector<3x[4]xf32>,
// CHECK-SAME: %[[MEMREF:.*]]: memref<3x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[ALLOCA_VEC:.*]] = memref.alloca() : memref<vector<3x[4]xf32>>
// CHECK: %[[ALLOCA_MASK:.*]] = memref.alloca() : memref<vector<3x[4]xi1>>
// CHECK: %[[DIM_SIZE:.*]] = memref.dim %[[MEMREF]], %[[C1]] : memref<3x?xf32>
// CHECK: %[[MASK:.*]] = vector.create_mask %[[C1]], %[[DIM_SIZE]] : vector<3x[4]xi1>
// CHECK: memref.store %[[MASK]], %[[ALLOCA_MASK]][] : memref<vector<3x[4]xi1>>
// CHECK: memref.store %[[VEC]], %[[ALLOCA_VEC]][] : memref<vector<3x[4]xf32>>
// CHECK: %[[UNPACK_VECTOR:.*]] = vector.type_cast %[[ALLOCA_VEC]] : memref<vector<3x[4]xf32>> to memref<3xvector<[4]xf32>>
// CHECK: %[[UNPACK_MASK:.*]] = vector.type_cast %[[ALLOCA_MASK]] : memref<vector<3x[4]xi1>> to memref<3xvector<[4]xi1>>
// CHECK: scf.for %[[VAL_11:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
// CHECK: %[[VECTOR_SLICE:.*]] = memref.load %[[UNPACK_VECTOR]]{{\[}}%[[VAL_11]]] : memref<3xvector<[4]xf32>>
// CHECK: %[[MASK_SLICE:.*]] = memref.load %[[UNPACK_MASK]]{{\[}}%[[VAL_11]]] : memref<3xvector<[4]xi1>>
// CHECK: vector.transfer_write %[[VECTOR_SLICE]], %[[MEMREF]]{{\[}}%[[VAL_11]], %[[C0]]], %[[MASK_SLICE]] {in_bounds = [true]} : vector<[4]xf32>, memref<3x?xf32>
// CHECK: }
// CHECK: return
// CHECK: }

// -----

/// The following two tests currently cannot be lowered via unpacking the leading dim since it is scalable.
/// It may be possible to special case this via a dynamic dim in future.

func.func @cannot_lower_transfer_write_with_leading_scalable(%vec: vector<[4]x4xf32>, %arg0: memref<?x4xf32>) {
  %c0 = arith.constant 0 : index
  %c4 = arith.constant 4 : index
  %cst = arith.constant 0.000000e+00 : f32
  %dim = memref.dim %arg0, %c0 : memref<?x4xf32>
  %mask = vector.create_mask %dim, %c4 : vector<[4]x4xi1>
  vector.transfer_write %vec, %arg0[%c0, %c0], %mask {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x4xf32>
  return
}
// CHECK-LABEL: func.func @cannot_lower_transfer_write_with_leading_scalable(
// CHECK-SAME: %[[VEC:.*]]: vector<[4]x4xf32>,
// CHECK-SAME: %[[MEMREF:.*]]: memref<?x4xf32>)
// CHECK: vector.transfer_write %[[VEC]], %[[MEMREF]][%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x4xf32>

// -----

func.func @cannot_lower_transfer_read_with_leading_scalable(%arg0: memref<?x4xf32>) -> vector<[4]x4xf32> {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  %cst = arith.constant 0.000000e+00 : f32
  %dim = memref.dim %arg0, %c0 : memref<?x4xf32>
  %mask = vector.create_mask %dim, %c4 : vector<[4]x4xi1>
  %read = vector.transfer_read %arg0[%c0, %c0], %cst, %mask {in_bounds = [true, true]} : memref<?x4xf32>, vector<[4]x4xf32>
  return %read : vector<[4]x4xf32>
}
// CHECK-LABEL: func.func @cannot_lower_transfer_read_with_leading_scalable(
// CHECK-SAME: %[[MEMREF:.*]]: memref<?x4xf32>)
// CHECK: %{{.*}} = vector.transfer_read %[[MEMREF]][%{{.*}}, %{{.*}}], %{{.*}}, %{{.*}} {in_bounds = [true, true]} : memref<?x4xf32>, vector<[4]x4xf32>

// -----

// Check that the `TransferOpConversion` generates valid indices for the LoadOp.
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, 0, 0, d3)>
func.func @does_not_crash_on_unpack_one_dim(%subview: memref<1x1x1x1xi32>, %mask: vector<1x1xi1>) -> vector<1x1x1x1xi32> {
  %c0 = arith.constant 0 : index
  %c0_i32 = arith.constant 0 : i32
  %3 = vector.transfer_read %subview[%c0, %c0, %c0, %c0], %c0_i32, %mask {permutation_map = #map1}
    : memref<1x1x1x1xi32>, vector<1x1x1x1xi32>
  return %3 : vector<1x1x1x1xi32>
}
// CHECK-LABEL: func.func @does_not_crash_on_unpack_one_dim
// CHECK: %[[ALLOCA_0:.*]] = memref.alloca() : memref<vector<1x1xi1>>
// CHECK: %[[MASK:.*]] = vector.type_cast %[[ALLOCA_0]] : memref<vector<1x1xi1>> to memref<1xvector<1xi1>>
// CHECK: memref.load %[[MASK]][%{{.*}}] : memref<1xvector<1xi1>>

// -----

// Check that the `TransferOpConversion` generates valid indices for the StoreOp.
// This test is pulled from an integration test for ArmSVE.

func.func @add_arrays_of_scalable_vectors(%a: memref<1x2x?xf32>, %b: memref<1x2x?xf32>) -> vector<1x2x[4]xf32> {
  %c0 = arith.constant 0 : index
  %c2 = arith.constant 2 : index
  %c3 = arith.constant 2 : index
  %cst = arith.constant 0.000000e+00 : f32
  %dim_a = memref.dim %a, %c2 : memref<1x2x?xf32>
  %mask_a = vector.create_mask %c2, %c3, %dim_a : vector<1x2x[4]xi1>
  %vector_a = vector.transfer_read %a[%c0, %c0, %c0], %cst, %mask_a {in_bounds = [true, true, true]} : memref<1x2x?xf32>, vector<1x2x[4]xf32>
  return %vector_a : vector<1x2x[4]xf32>
}
// CHECK-LABEL: func.func @add_arrays_of_scalable_vectors
// CHECK: scf.for
// CHECK: scf.for
// CHECK: memref.load

// -----

// FULL-UNROLL-LABEL: @cannot_fully_unroll_transfer_write_of_nd_scalable_vector
func.func @cannot_fully_unroll_transfer_write_of_nd_scalable_vector(%vec: vector<[4]x[4]xf32>, %memref: memref<?x?xf32>) {
  // FULL-UNROLL-NOT: vector.extract
  // FULL-UNROLL: vector.transfer_write {{.*}} : vector<[4]x[4]xf32>, memref<?x?xf32>
  // FULL-UNROLL-NOT: vector.extract
  %c0 = arith.constant 0 : index
  vector.transfer_write %vec, %memref[%c0, %c0] {in_bounds = [true, true]} : vector<[4]x[4]xf32>, memref<?x?xf32>
  return
}

// -----

// TARGET-RANK-ZERO-LABEL: func @unroll_transfer_write_target_rank_zero
func.func @unroll_transfer_write_target_rank_zero(%vec : vector<2xi32>) {
  %alloc = memref.alloc() : memref<4xi32>
  %c0 = arith.constant 0 : index
  vector.transfer_write %vec, %alloc[%c0] : vector<2xi32>, memref<4xi32>
  return
}
// TARGET-RANK-ZERO: %[[ALLOC:.*]] = memref.alloc() : memref<4xi32>
// TARGET-RANK-ZERO: %[[EXTRACTED1:.*]] = vector.extract {{.*}} : i32 from vector<2xi32>
// TARGET-RANK-ZERO: %[[BROADCASTED1:.*]] = vector.broadcast %[[EXTRACTED1]] : i32 to vector<i32>
// TARGET-RANK-ZERO: vector.transfer_write %[[BROADCASTED1]], %[[ALLOC]]{{.*}} : vector<i32>, memref<4xi32>
// TARGET-RANK-ZERO: %[[EXTRACTED2:.*]] = vector.extract {{.*}} : i32 from vector<2xi32>
// TARGET-RANK-ZERO: %[[BROADCASTED2:.*]] = vector.broadcast %[[EXTRACTED2]] : i32 to vector<i32>
// TARGET-RANK-ZERO: vector.transfer_write %[[BROADCASTED2]], %[[ALLOC]]{{.*}} : vector<i32>, memref<4xi32>

// -----

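/// Note: the scalable-transpose store tests below check that writing a
/// transposed vector<4x[4]xf32> (i.e. a vector<[4]x4xf32>) is lowered, under
/// full-unroll with lower-scalable, to an scf.for over the scalable dimension
/// that rebuilds each row with vector.extract + vector.from_elements before a
/// 1-D in-bounds transfer_write.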
func.func @scalable_transpose_store_unmasked(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  vector.transfer_write %transpose, %dest[%i, %j] {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x?xf32>
  return
}
// FULL-UNROLL: #[[$SLICE_MAP:.+]] = affine_map<(d0)[s0] -> (d0 + s0)>
// FULL-UNROLL-LABEL: func.func @scalable_transpose_store_unmasked(
// FULL-UNROLL-SAME: %[[VEC:.*]]: vector<4x[4]xf32>,
// FULL-UNROLL-SAME: %[[DEST:.*]]: memref<?x?xf32>,
// FULL-UNROLL-SAME: %[[I:.*]]: index,
// FULL-UNROLL-SAME: %[[J:.*]]: index)
// FULL-UNROLL-DAG: %[[C0:.*]] = arith.constant 0 : index
// FULL-UNROLL-DAG: %[[C1:.*]] = arith.constant 1 : index
// FULL-UNROLL-DAG: %[[C4:.*]] = arith.constant 4 : index
// FULL-UNROLL: %[[SLICE_0:.*]] = vector.extract %[[VEC]][0] : vector<[4]xf32> from vector<4x[4]xf32>
// FULL-UNROLL: %[[SLICE_1:.*]] = vector.extract %[[VEC]][1] : vector<[4]xf32> from vector<4x[4]xf32>
// FULL-UNROLL: %[[SLICE_2:.*]] = vector.extract %[[VEC]][2] : vector<[4]xf32> from vector<4x[4]xf32>
// FULL-UNROLL: %[[SLICE_3:.*]] = vector.extract %[[VEC]][3] : vector<[4]xf32> from vector<4x[4]xf32>
// FULL-UNROLL: %[[VSCALE:.*]] = vector.vscale
// FULL-UNROLL: %[[C4_VSCALE:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
// FULL-UNROLL: scf.for %[[VAL_13:.*]] = %[[C0]] to %[[C4_VSCALE]] step %[[C1]] {
// FULL-UNROLL: %[[SLICE_I:.*]] = affine.apply #[[$SLICE_MAP]](%[[VAL_13]]){{\[}}%[[I]]]
// FULL-UNROLL: %[[ELEM_0:.*]] = vector.extract %[[SLICE_0]]{{\[}}%[[VAL_13]]] : f32 from vector<[4]xf32>
// FULL-UNROLL: %[[ELEM_1:.*]] = vector.extract %[[SLICE_1]]{{\[}}%[[VAL_13]]] : f32 from vector<[4]xf32>
// FULL-UNROLL: %[[ELEM_2:.*]] = vector.extract %[[SLICE_2]]{{\[}}%[[VAL_13]]] : f32 from vector<[4]xf32>
// FULL-UNROLL: %[[ELEM_3:.*]] = vector.extract %[[SLICE_3]]{{\[}}%[[VAL_13]]] : f32 from vector<[4]xf32>
// FULL-UNROLL: %[[TRANSPOSE_SLICE:.*]] = vector.from_elements %[[ELEM_0]], %[[ELEM_1]], %[[ELEM_2]], %[[ELEM_3]] : vector<4xf32>
// FULL-UNROLL: vector.transfer_write %[[TRANSPOSE_SLICE]], %[[DEST]]{{\[}}%[[SLICE_I]], %[[J]]] {in_bounds = [true]} : vector<4xf32>, memref<?x?xf32>

// -----

func.func @scalable_transpose_store_dynamic_mask(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index, %a: index, %b: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  %mask = vector.create_mask %a, %b : vector<[4]x4xi1>
  vector.transfer_write %transpose, %dest[%i, %j], %mask {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x?xf32>
  return
}
// FULL-UNROLL-LABEL: func.func @scalable_transpose_store_dynamic_mask(
// FULL-UNROLL-SAME: %{{.*}}, %[[A:.*]]: index, %[[B:.*]]: index)
// FULL-UNROLL: %[[SLICE_MASK:.*]] = vector.create_mask %[[B]] : vector<4xi1>
// FULL-UNROLL: scf.for %{{.*}} to %[[A]]
// FULL-UNROLL: vector.transfer_write {{.*}}, %[[SLICE_MASK]]

// -----

func.func @scalable_transpose_store_constant_mask(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  %mask = vector.constant_mask [4, 3] : vector<[4]x4xi1>
  vector.transfer_write %transpose, %dest[%i, %j], %mask {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x?xf32>
  return
}
// FULL-UNROLL-LABEL: func.func @scalable_transpose_store_constant_mask
// FULL-UNROLL: %[[C3:.*]] = arith.constant 3 : index
// FULL-UNROLL: %[[C4:.*]] = arith.constant 4 : index
// FULL-UNROLL: %[[VSCALE:.*]] = vector.vscale
// FULL-UNROLL: %[[C4_VSCALE:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
// FULL-UNROLL: %[[SLICE_MASK:.*]] = vector.create_mask %[[C3]] : vector<4xi1>
// FULL-UNROLL: scf.for %{{.*}} to %[[C4_VSCALE]]
// FULL-UNROLL: vector.transfer_write {{.*}}, %[[SLICE_MASK]]

// -----

/// Unsupported transpose.
func.func @negative_scalable_transpose_store_0(%vec: vector<[4]x4xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<[4]x4xf32> to vector<4x[4]xf32>
  vector.transfer_write %transpose, %dest[%i, %j] {in_bounds = [true, true]} : vector<4x[4]xf32>, memref<?x?xf32>
  return
}
// FULL-UNROLL-LABEL: @negative_scalable_transpose_store_0
// FULL-UNROLL-NOT: scf.for

// -----

/// Non-identity permutation map (should be lowered first).
func.func @negative_scalable_transpose_store_1(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  vector.transfer_write %transpose, %dest[%i, %j] {in_bounds = [true, true], permutation_map = affine_map<(d0,d1) -> (d1, d0)> } : vector<[4]x4xf32>, memref<?x?xf32>
  return
}
// FULL-UNROLL-LABEL: @negative_scalable_transpose_store_1
// FULL-UNROLL-NOT: scf.for


// -----

/// Out-of-bounds dim.
func.func @negative_scalable_transpose_store_2(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  vector.transfer_write %transpose, %dest[%i, %j] {in_bounds = [false, true]} : vector<[4]x4xf32>, memref<?x?xf32>
  return
}
// FULL-UNROLL-LABEL: @negative_scalable_transpose_store_2
// FULL-UNROLL-NOT: scf.for

// -----

/// Source not a vector.transpose.
func.func @negative_scalable_transpose_store_3(%vec: vector<[4]x4xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  vector.transfer_write %vec, %dest[%i, %j] {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x?xf32>
  return
}
// FULL-UNROLL-LABEL: @negative_scalable_transpose_store_3
// FULL-UNROLL-NOT: scf.for