// RUN: mlir-opt %s -transform-interpreter -split-input-file | FileCheck %s

func.func @vectorize_dynamic_identity(%arg0: tensor<?xf32>,
                                      %arg1: tensor<?xf32>,
                                      %arg2: tensor<?xf32>) -> tensor<?xf32> {
  %0 = linalg.generic { indexing_maps = [affine_map<(d0) -> (d0)>,
                                         affine_map<(d0) -> (d0)>,
                                         affine_map<(d0) -> (d0)>],
                        iterator_types = ["parallel"] }
    ins(%arg0, %arg1 : tensor<?xf32>, tensor<?xf32>)
    outs(%arg2 : tensor<?xf32>) {
  ^bb(%in0: f32, %in1: f32, %out: f32) :
    %0 = arith.addf %in0, %in1 : f32
    linalg.yield %0 : f32
  } -> tensor<?xf32>
  return %0 : tensor<?xf32>
}

// CHECK-LABEL: @vectorize_dynamic_identity
// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
// CHECK: %[[VAL_4:.*]] = tensor.dim %{{.*}}, %[[VAL_3]] : tensor<?xf32>
// CHECK: %[[VAL_7:.*]] = vector.create_mask %[[VAL_4]] : vector<[4]xi1>
// CHECK: %[[VAL_8:.*]] = vector.mask %[[VAL_7]] { vector.transfer_read %{{.*}} {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
// CHECK: %[[VAL_10:.*]] = vector.mask %[[VAL_7]] { vector.transfer_read %{{.*}} {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
// CHECK: %[[VAL_12:.*]] = vector.mask %[[VAL_7]] { vector.transfer_read %{{.*}} {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
// CHECK: %[[VAL_13:.*]] = arith.addf %[[VAL_8]], %[[VAL_10]] : vector<[4]xf32>
// CHECK: %[[VAL_14:.*]] = vector.mask %[[VAL_7]] { vector.transfer_write %{{.*}} {in_bounds = [true]} : vector<[4]xf32>, tensor<?xf32> } : vector<[4]xi1> -> tensor<?xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [[4]] : !transform.any_op
    transform.yield
  }
}

// -----

func.func @vectorize_partial_dynamic_identity(%arg0: tensor<8x?xf32>,
                                              %arg1: tensor<8x?xf32>,
                                              %arg2: tensor<8x?xf32>) -> tensor<8x?xf32> {
  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                                         affine_map<(d0, d1) -> (d0, d1)>,
                                         affine_map<(d0, d1) -> (d0, d1)>],
                        iterator_types = ["parallel", "parallel"] }
    ins(%arg0, %arg1 : tensor<8x?xf32>, tensor<8x?xf32>)
    outs(%arg2 : tensor<8x?xf32>) {
  ^bb(%in0: f32, %in1: f32, %out: f32) :
    %0 = arith.addf %in0, %in1 : f32
    linalg.yield %0 : f32
  } -> tensor<8x?xf32>
  return %0 : tensor<8x?xf32>
}

// CHECK-LABEL: func.func @vectorize_partial_dynamic_identity(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<8x?xf32>, %[[VAL_1:.*]]: tensor<8x?xf32>, %[[VAL_2:.*]]: tensor<8x?xf32>) -> tensor<8x?xf32> {
// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[VAL_4:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor<8x?xf32>
// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 8 : index
// CHECK: %[[VAL_8:.*]] = vector.create_mask %[[VAL_7]], %[[VAL_4]] : vector<8x[32]xi1>
// CHECK: %[[VAL_9:.*]] = vector.mask %[[VAL_8]] { vector.transfer_read %[[VAL_0]][%[[VAL_5]], %[[VAL_5]]], %[[VAL_6]] {in_bounds = [true, true]} : tensor<8x?xf32>, vector<8x[32]xf32> } : vector<8x[32]xi1> -> vector<8x[32]xf32>
// CHECK: %[[VAL_10:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[VAL_11:.*]] = vector.mask %[[VAL_8]] { vector.transfer_read %[[VAL_1]][%[[VAL_5]], %[[VAL_5]]], %[[VAL_10]] {in_bounds = [true, true]} : tensor<8x?xf32>, vector<8x[32]xf32> } : vector<8x[32]xi1> -> vector<8x[32]xf32>
// CHECK: %[[VAL_12:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[VAL_13:.*]] = vector.mask %[[VAL_8]] { vector.transfer_read %[[VAL_2]][%[[VAL_5]], %[[VAL_5]]], %[[VAL_12]] {in_bounds = [true, true]} : tensor<8x?xf32>, vector<8x[32]xf32> } : vector<8x[32]xi1> -> vector<8x[32]xf32>
// CHECK: %[[VAL_14:.*]] = arith.addf %[[VAL_9]], %[[VAL_11]] : vector<8x[32]xf32>
// CHECK: %[[VAL_15:.*]] = arith.constant 0 : index
// CHECK: %[[VAL_16:.*]] = vector.mask %[[VAL_8]] { vector.transfer_write %[[VAL_14]], %[[VAL_2]][%[[VAL_15]], %[[VAL_15]]] {in_bounds = [true, true]} : vector<8x[32]xf32>, tensor<8x?xf32> } : vector<8x[32]xi1> -> tensor<8x?xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [8, [32]] : !transform.any_op
    transform.yield
  }
}

// -----

func.func @vectorize_static_shape_with_mask(%arg0: tensor<8x30xf32>,
                                            %arg1: tensor<8x30xf32>,
                                            %arg2: tensor<8x30xf32>) -> tensor<8x30xf32> {
  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                                         affine_map<(d0, d1) -> (d0, d1)>,
                                         affine_map<(d0, d1) -> (d0, d1)>],
                        iterator_types = ["parallel", "parallel"] }
    ins(%arg0, %arg1 : tensor<8x30xf32>, tensor<8x30xf32>)
    outs(%arg2 : tensor<8x30xf32>) {
  ^bb(%in0: f32, %in1: f32, %out: f32) :
    %0 = arith.addf %in0, %in1 : f32
    linalg.yield %0 : f32
  } -> tensor<8x30xf32>
  return %0 : tensor<8x30xf32>
}

// CHECK-LABEL: func.func @vectorize_static_shape_with_mask(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<8x30xf32>, %[[VAL_1:.*]]: tensor<8x30xf32>, %[[VAL_2:.*]]: tensor<8x30xf32>) -> tensor<8x30xf32> {
// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 8 : index
// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 30 : index
// CHECK: %[[VAL_7:.*]] = vector.create_mask %[[VAL_5]], %[[VAL_6]] : vector<8x[32]xi1>
// CHECK: %[[VAL_8:.*]] = vector.mask %[[VAL_7]] { vector.transfer_read %[[VAL_0]][%[[VAL_3]], %[[VAL_3]]], %[[VAL_4]] {in_bounds = [true, true]} : tensor<8x30xf32>, vector<8x[32]xf32> } : vector<8x[32]xi1> -> vector<8x[32]xf32>
// CHECK: %[[VAL_9:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[VAL_10:.*]] = vector.mask %[[VAL_7]] { vector.transfer_read %[[VAL_1]][%[[VAL_3]], %[[VAL_3]]], %[[VAL_9]] {in_bounds = [true, true]} : tensor<8x30xf32>, vector<8x[32]xf32> } : vector<8x[32]xi1> -> vector<8x[32]xf32>
// CHECK: %[[VAL_11:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[VAL_12:.*]] = vector.mask %[[VAL_7]] { vector.transfer_read %[[VAL_2]][%[[VAL_3]], %[[VAL_3]]], %[[VAL_11]] {in_bounds = [true, true]} : tensor<8x30xf32>, vector<8x[32]xf32> } : vector<8x[32]xi1> -> vector<8x[32]xf32>
// CHECK: %[[VAL_13:.*]] = arith.addf %[[VAL_8]], %[[VAL_10]] : vector<8x[32]xf32>
// CHECK: %[[VAL_14:.*]] = arith.constant 0 : index
// CHECK: %[[VAL_15:.*]] = vector.mask %[[VAL_7]] { vector.transfer_write %[[VAL_13]], %[[VAL_2]][%[[VAL_14]], %[[VAL_14]]] {in_bounds = [true, true]} : vector<8x[32]xf32>, tensor<8x30xf32> } : vector<8x[32]xi1> -> tensor<8x30xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [8, [32]] : !transform.any_op
    transform.yield
  }
}

// -----

// NOTE: Often, non-trailing scalable sizes are problematic - there are no
// "scalable" arrays of vectors at the LLVM level (multi-dim vectors are
// decomposed into arrays of aggregates). However, the trailing dim in this
// case is 1 and that can be folded away later.

func.func @vectorize_dynamic_fill_leading_scalable(%A : tensor<?x?xf32>, %arg0 : f32) -> tensor<?x?xf32> {
  %0 = linalg.fill ins(%arg0 : f32) outs(%A : tensor<?x?xf32>) -> tensor<?x?xf32>
  return %0 : tensor<?x?xf32>
}

// CHECK-LABEL: func.func @vectorize_dynamic_fill_leading_scalable
// CHECK: %[[DIM0:.*]] = tensor.dim
// CHECK: %[[DIM1:.*]] = tensor.dim
// CHECK: %[[MASK:.*]] = vector.create_mask %[[DIM0]], %[[DIM1]] : vector<[8]x1xi1>
// CHECK: %[[BCAST:.*]] = vector.broadcast %{{.*}} : f32 to vector<[8]x1xf32>
// CHECK: vector.mask %[[MASK]] { vector.transfer_write %[[BCAST]], {{.*}} {in_bounds = [true, true]} : vector<[8]x1xf32>, tensor<?x?xf32> } : vector<[8]x1xi1>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [[8], 1] : !transform.any_op
    transform.yield
  }
}

// -----

#map = affine_map<(d0) -> (d0)>
func.func @vectorize_linalg_index(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
  %0 = linalg.generic {
    indexing_maps = [#map],
    iterator_types = ["parallel"]
  } outs(%arg1 : tensor<?xf32>) {
  ^bb0(%in: f32):
    %1 = linalg.index 0 : index
    %2 = tensor.extract %arg0[%1] : tensor<?xf32>
    linalg.yield %2 : f32
  } -> tensor<?xf32>
  return %0 : tensor<?xf32>
}

// CHECK-LABEL: @vectorize_linalg_index
// CHECK-SAME: %[[SRC:.*]]: tensor<?xf32>, %[[DST:.*]]: tensor<?xf32>
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK: %[[DST_DIM0:.*]] = tensor.dim %[[DST]], %[[C0]] : tensor<?xf32>
// CHECK: %[[MASK:.*]] = vector.create_mask %[[DST_DIM0]] : vector<[4]xi1>
// CHECK-DAG: %[[STEP:.+]] = vector.step : vector<[4]xindex>
// CHECK-DAG: %[[STEP_ELEMENT:.+]] = vector.extract %[[STEP]][0] : index from vector<[4]xindex>

// CHECK: %[[READ:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[SRC]][%[[STEP_ELEMENT]]], %cst {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
// CHECK: %[[OUT:.*]] = vector.mask %[[MASK]] { vector.transfer_write %[[READ]], %[[DST]]{{\[}}%[[C0]]] {in_bounds = [true]} : vector<[4]xf32>, tensor<?xf32> } : vector<[4]xi1> -> tensor<?xf32>
// CHECK: return %[[OUT]] : tensor<?xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [[4]] {vectorize_nd_extract} : !transform.any_op

    %func = transform.structured.match ops{["func.func"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.apply_patterns to %func {
      transform.apply_patterns.linalg.tiling_canonicalization
    } : !transform.any_op
    transform.yield
  }
}

// -----

func.func @vectorize_dynamic_reduction_scalable_1d(%arg0: tensor<?xf32>,
                                                   %arg1: tensor<f32>) -> tensor<f32> {

  %0 = linalg.reduce ins(%arg0 : tensor<?xf32>) outs(%arg1 : tensor<f32>) dimensions = [0]
    (%in: f32, %init: f32) {
      %0 = arith.addf %in, %init : f32
      linalg.yield %0 : f32
    }
  return %0 : tensor<f32>
}

// CHECK-LABEL: func.func @vectorize_dynamic_reduction_scalable_1d(
// CHECK-SAME: %[[ARG_0:.*]]: tensor<?xf32>, %[[ARG_1:.*]]: tensor<f32>) -> tensor<f32> {
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<[4]xi1>
// CHECK: %[[VEC_RD_0:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
// CHECK: %[[C0_F32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[VEC_RD_1:.*]] = vector.transfer_read %[[ARG_1]][], %[[C0_F32]] : tensor<f32>, vector<f32>
// CHECK: %[[ACC_f32:.*]] = vector.extract %[[VEC_RD_1]][] : f32 from vector<f32>
// CHECK: %[[REDUCE:.*]] = vector.mask %[[MASK]] { vector.multi_reduction <add>, %[[VEC_RD_0]], %[[ACC_f32]] [0] : vector<[4]xf32> to f32 } : vector<[4]xi1> -> f32
// CHECK: %[[VEC_f32:.*]] = vector.broadcast %[[REDUCE]] : f32 to vector<f32>
// CHECK: %{{.*}} = vector.transfer_write %[[VEC_f32]], %[[ARG_1]][] : vector<f32>, tensor<f32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [[4]] : !transform.any_op
    transform.yield
  }
}

// -----

// Note: scalable version of `vectorize_dynamic_reduction` in test/Dialect/Linalg/vectorization.mlir.
func.func @vectorize_dynamic_reduction_scalable_2d(%arg0: tensor<?x?xf32>,
                                                   %arg1: tensor<?xf32>) -> tensor<?xf32> {
  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                                         affine_map<(d0, d1) -> (d0)>],
                        iterator_types = ["parallel", "reduction"] }
    ins(%arg0 : tensor<?x?xf32>)
    outs(%arg1 : tensor<?xf32>) {
  ^bb(%in: f32, %out: f32) :
    %0 = arith.addf %in, %out : f32
    linalg.yield %0 : f32
  } -> tensor<?xf32>
  return %0 : tensor<?xf32>
}

// CHECK-LABEL: func.func @vectorize_dynamic_reduction_scalable_2d(
// CHECK-SAME: %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>) -> tensor<?xf32> {
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?x?xf32>
// CHECK: %[[C1_idx:.*]] = arith.constant 1 : index
// CHECK: %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor<?x?xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<4x[8]xi1>
// CHECK: %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x[8]xf32> } : vector<4x[8]xi1> -> vector<4x[8]xf32>
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_1d:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<4xi1>
// CHECK: %[[VEC_RD_1:.*]] = vector.mask %[[MASK_1d]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK: %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction <add>, %[[VEC_RD_0]], %[[VEC_RD_1]] [1] : vector<4x[8]xf32> to vector<4xf32> } : vector<4x[8]xi1> -> vector<4xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %{{.*}} = vector.mask %[[MASK_1d]] { vector.transfer_write %[[REDUCE]], %[[ARG_1]][%[[C0_idx]]] {in_bounds = [true]} : vector<4xf32>, tensor<?xf32> } : vector<4xi1> -> tensor<?xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [4, [8]] : !transform.any_op
    transform.yield
  }
}

// -----

func.func @vectorize_dynamic_matvec_trailing_reduction_dim(%arg0: tensor<?x?xf32>,
                                                           %arg1: tensor<?xf32>,
                                                           %arg2: tensor<?xf32>) {
  linalg.matvec ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?xf32>)
                outs(%arg2 : tensor<?xf32>) -> tensor<?xf32>
  return
}

// CHECK-LABEL: func.func @vectorize_dynamic_matvec_trailing_reduction_dim(
// CHECK-SAME: %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>, %[[ARG_2:.*]]: tensor<?xf32>) {
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?x?xf32>
// CHECK: %[[C1_idx:.*]] = arith.constant 1 : index
// CHECK: %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor<?x?xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<4x[4]xi1>
// CHECK: %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x[4]xf32> } : vector<4x[4]xi1> -> vector<4x[4]xf32>
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_d1:.*]] = vector.create_mask %[[DIM_A0_1]] : vector<[4]xi1>
// CHECK: %[[VEC_RD_1:.*]] = vector.mask %[[MASK_d1]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true], permutation_map = #map} : tensor<?xf32>, vector<4x[4]xf32> } : vector<[4]xi1> -> vector<4x[4]xf32>
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_d2:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<4xi1>
// CHECK: %[[VEC_RD_2:.*]] = vector.mask %[[MASK_d2]] { vector.transfer_read %[[ARG_2]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK: %[[MUL:.*]] = arith.mulf %[[VEC_RD_0:.*]], %[[VEC_RD_1:.*]] : vector<4x[4]xf32>
// CHECK: %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction <add>, %[[MUL]], %[[VEC_RD_2]] [1] : vector<4x[4]xf32> to vector<4xf32> } : vector<4x[4]xi1> -> vector<4xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %{{.*}} = vector.mask %[[MASK_d2]] { vector.transfer_write %[[REDUCE]], %[[ARG_2]][%[[C0_idx]]] {in_bounds = [true]} : vector<4xf32>, tensor<?xf32> } : vector<4xi1> -> tensor<?xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.matvec"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [4, [4]] : !transform.any_op
    transform.yield
  }
}

// -----

func.func @vectorize_dynamic_generic_matvec_leading_parallel_dim(%arg0: tensor<?x?xf32>,
                                                                 %arg1: tensor<?xf32>,
                                                                 %arg2: tensor<?xf32>) -> tensor<?xf32> {
  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                                         affine_map<(d0, d1) -> (d1)>,
                                         affine_map<(d0, d1) -> (d0)>],
                        iterator_types = ["parallel", "reduction"] }
    ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?xf32>)
    outs(%arg2 : tensor<?xf32>) {
  ^bb(%mat: f32, %vec: f32, %res: f32) :
    %0 = arith.mulf %mat, %vec : f32
    %1 = arith.addf %res, %0 : f32
    linalg.yield %1 : f32
  } -> tensor<?xf32>
  return %0 : tensor<?xf32>
}

// CHECK-LABEL: func.func @vectorize_dynamic_generic_matvec_leading_parallel_dim(
// CHECK-SAME: %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>, %[[ARG_2:.*]]: tensor<?xf32>) -> tensor<?xf32> {
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor<?x?xf32>
// CHECK: %[[C1_idx:.*]] = arith.constant 1 : index
// CHECK: %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor<?x?xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<[4]x4xi1>
// CHECK: %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<[4]x4xf32> } : vector<[4]x4xi1> -> vector<[4]x4xf32>
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_d1:.*]] = vector.create_mask %[[DIM_A0_1]] : vector<4xi1>
// CHECK: %[[VEC_RD_1:.*]] = vector.mask %[[MASK_d1]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true], permutation_map = #map} : tensor<?xf32>, vector<[4]x4xf32> } : vector<4xi1> -> vector<[4]x4xf32>
// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[MASK_d2:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<[4]xi1>
// CHECK: %[[VEC_RD_2:.*]] = vector.mask %[[MASK_d2]] { vector.transfer_read %[[ARG_2]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
// CHECK: %[[MUL:.*]] = arith.mulf %[[VEC_RD_0:.*]], %[[VEC_RD_1:.*]] : vector<[4]x4xf32>
// CHECK: %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction <add>, %[[MUL]], %[[VEC_RD_2]] [1] : vector<[4]x4xf32> to vector<[4]xf32> } : vector<[4]x4xi1> -> vector<[4]xf32>
// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index
// CHECK: %{{.*}} = vector.mask %[[MASK_d2]] { vector.transfer_write %[[REDUCE]], %[[ARG_2]][%[[C0_idx]]] {in_bounds = [true]} : vector<[4]xf32>, tensor<?xf32> } : vector<[4]xi1> -> tensor<?xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [[4], 4] : !transform.any_op
    transform.yield
  }
}