// RUN: mlir-opt -split-input-file \
// RUN:   -transform-preload-library='transform-library-paths=%p/td/vectorize-with-patterns.mlir' \
// RUN:   -transform-interpreter=entry-point=vectorize_with_patterns %s | FileCheck %s

//===----------------------------------------------------------------------===//
// Contiguous load
//===----------------------------------------------------------------------===//

#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
func.func @vectorize_nd_tensor_extract_transfer_read_basic(
    %arg0: tensor<3x3x3xf32>,
    %arg1: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {

  %res = linalg.generic {
    indexing_maps = [#map],
    iterator_types = ["parallel", "parallel", "parallel"]
  } outs(%arg1 : tensor<1x1x3xf32>) {
  ^bb0(%out: f32):
    %1 = linalg.index 0 : index
    %2 = linalg.index 1 : index
    %3 = linalg.index 2 : index
    %4 = tensor.extract %arg0[%1, %2, %3] : tensor<3x3x3xf32>
    linalg.yield %4 : f32
  } -> tensor<1x1x3xf32>

  return %res : tensor<1x1x3xf32>
}

// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_transfer_read_basic
// CHECK-SAME: %[[ARG0:.*]]: tensor<3x3x3xf32>
// CHECK-SAME: %[[ARG1:.*]]: tensor<1x1x3xf32>

// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[CST:.+]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG: %[[CST_0:.+]] = arith.constant dense<0> : vector<1xindex>
// CHECK-DAG: %[[CST_1:.+]] = arith.constant dense<[0, 1, 2]> : vector<3xindex>

// CHECK-DAG: %[[IDX1:.+]] = vector.extract %[[CST_0]][0] : index from vector<1xindex>
// CHECK-DAG: %[[IDX2:.+]] = vector.extract %[[CST_0]][0] : index from vector<1xindex>
// CHECK-DAG: %[[IDX3:.+]] = vector.extract %[[CST_1]][0] : index from vector<3xindex>

// CHECK: %[[READ:.*]] = vector.transfer_read %[[ARG0]][%[[IDX1]], %[[IDX2]], %[[IDX3]]], %[[CST]] {in_bounds = [true, true, true]} : tensor<3x3x3xf32>, vector<1x1x3xf32>
// CHECK: vector.transfer_write %[[READ]], %[[ARG1]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x3xf32>

// -----

func.func @vectorize_nd_tensor_extract_transfer_read_complex(%6: tensor<45x80x16xf32>, %arg0: index, %arg2: index, %arg1: index, %arg4: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
  %c79 = arith.constant 79 : index
  %25 = linalg.generic {
    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
    iterator_types = ["parallel", "parallel"]
  } outs(%extracted_slice : tensor<1x4xf32>) {
  ^bb0(%out: f32):
    %26 = linalg.index 0 : index
    %27 = arith.addi %arg0, %26 : index
    %28 = arith.addi %27, %arg2 : index
    %29 = linalg.index 1 : index
    %30 = arith.addi %arg1, %29 : index
    %31 = arith.addi %30, %arg4 : index
    %extracted = tensor.extract %6[%28, %c79, %31] : tensor<45x80x16xf32>
    linalg.yield %extracted : f32
  } -> tensor<1x4xf32>
  return %25 : tensor<1x4xf32>
}


// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_transfer_read_complex(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<45x80x16xf32>,
// CHECK-SAME: %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index, %[[VAL_3:.*]]: index, %[[VAL_4:.*]]: index,
// CHECK-SAME: %[[VAL_5:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
// CHECK-DAG: %[[VAL_6:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[VAL_10:.*]] = arith.constant 79 : index
// CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_1]], %[[VAL_2]] : index
// CHECK: %[[VAL_13:.*]] = vector.broadcast %[[VAL_3]] : index to vector<4xindex>
// CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_6]] : vector<4xindex>
// CHECK: %[[VAL_15:.*]] = vector.broadcast %[[VAL_4]] : index to vector<4xindex>
// CHECK: %[[VAL_16:.*]] = arith.addi %[[VAL_14]], %[[VAL_15]] : vector<4xindex>

// CHECK: %[[VAL_19:.*]] = vector.extract %[[VAL_16]][0] : index from vector<4xindex>

// CHECK: %[[VAL_20:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_11]], %[[VAL_10]], %[[VAL_19]]], %[[VAL_8]] {in_bounds = [true, true]} : tensor<45x80x16xf32>, vector<1x4xf32>
// CHECK: %[[VAL_21:.*]] = vector.transfer_write %[[VAL_20]], %[[VAL_5]]{{\[}}%[[VAL_9]], %[[VAL_9]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
// CHECK: return %[[VAL_21]] : tensor<1x4xf32>
// CHECK: }

// -----

// The vectorizer converts `affine.apply` so that the subsequent Ops can be vectorised based on the converted ops. Contiguous load.
func.func @vectorize_nd_tensor_extract_with_affine_apply_contiguous(%6: tensor<80x16xf32>, %arg0: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
  %c79 = arith.constant 79 : index
  %1 = linalg.generic {
    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
    iterator_types = ["parallel", "parallel"]
  } outs(%extracted_slice : tensor<1x4xf32>) {
  ^bb0(%out: f32):
    %2 = linalg.index 1 : index
    %3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%2, %arg0)
    %extracted = tensor.extract %6[%c79, %3] : tensor<80x16xf32>
    linalg.yield %extracted : f32
  } -> tensor<1x4xf32>
  return %1 : tensor<1x4xf32>
}

// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_affine_apply_contiguous(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<80x16xf32>,
// CHECK-SAME: %[[VAL_1:.*]]: index,
// CHECK-SAME: %[[VAL_2:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
// CHECK-DAG: %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 79 : index
// CHECK: %[[VAL_8:.*]] = vector.broadcast %[[VAL_1]] : index to vector<4xindex>
// CHECK: %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_3]] : vector<4xindex>
// CHECK: %[[VAL_10:.*]] = vector.extract %[[VAL_9]][0] : index from vector<4xindex>
// CHECK: %[[VAL_11:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_10]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32>
// CHECK: %[[VAL_12:.*]] = vector.transfer_write %[[VAL_11]], %[[VAL_2]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
// CHECK: return %[[VAL_12]] : tensor<1x4xf32>
// CHECK: }

// -----

func.func @vectorize_nd_tensor_extract_with_tensor_extract(%input_1: tensor<1x20xi32>, %input_2: tensor<257x24xf32>, %arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index) -> tensor<1x1x4xf32> {
  %c0 = arith.constant 0 : index
  %c256 = arith.constant 256 : index
  %output = tensor.empty() : tensor<1x1x4xf32>
  %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} outs(%output : tensor<1x1x4xf32>) {
  ^bb0(%out: f32):
    %13 = linalg.index 0 : index
    %14 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 + d2)>(%arg0, %13, %arg2)
    %15 = linalg.index 2 : index
    %16 = linalg.index 1 : index
    %17 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 * 24 + d2 + d3)>(%arg1, %16, %15, %arg3)
    %extracted_0 = tensor.extract %input_1[%c0, %14] : tensor<1x20xi32>
    %18 = arith.index_cast %extracted_0 : i32 to index
    %19 = arith.maxsi %18, %c0 : index
    %20 = arith.minsi %19, %c256 : index
    %extracted_1 = tensor.extract %input_2[%20, %17] : tensor<257x24xf32>
    linalg.yield %extracted_1 : f32
  } -> tensor<1x1x4xf32>
  return %1 : tensor<1x1x4xf32>
}

// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_tensor_extract(
// CHECK-SAME: %[[INPUT_1:.*]]: tensor<1x20xi32>,
// CHECK-SAME: %[[INPUT_2:.*]]: tensor<257x24xf32>,
// CHECK-SAME: %[[INPUT_3:.*]]: index, %[[INPUT_4:.*]]: index, %[[INPUT_5:.*]]: index,
// CHECK: %[[EXTRACTED_0_IDX_0:.*]] = arith.constant 0 : index
// CHECK: %[[SCALAR:.*]] = arith.addi %[[INPUT_3]], %[[INPUT_5]] : index
// First `vector.transfer_read` from the generic Op - loop invariant scalar load.
// CHECK: vector.transfer_read %[[INPUT_1]][%[[EXTRACTED_0_IDX_0]], %[[SCALAR]]]
// CHECK-SAME: tensor<1x20xi32>, vector<i32>
// The following `tensor.extract` from the generic Op is a contiguous load (all Ops used
// for address calculation also satisfy the required conditions).
// CHECK: vector.transfer_read %[[INPUT_2]][%{{.*}}, %{{.*}}, %{{.*}} {in_bounds = [true, true]} : tensor<257x24xf32>, vector<1x4xf32>

// Make sure that non-linear arithmetic operations (e.g. arith.maxsi) are allowed when calculating indices for load operations. Contiguous load.
func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(%arg0: tensor<80x16xf32>, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
  %c16 = arith.constant 16 : index
  %1 = linalg.generic {
    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
    iterator_types = ["parallel", "parallel"]
  } outs(%extracted_slice : tensor<1x4xf32>) {
  ^bb0(%out: f32):
    %2 = linalg.index 0 : index
    %3 = linalg.index 1 : index
    %4 = arith.maxsi %2, %c16 : index
    %extracted = tensor.extract %arg0[%4, %3] : tensor<80x16xf32>
    linalg.yield %extracted : f32
  } -> tensor<1x4xf32>
  return %1 : tensor<1x4xf32>
}

// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<80x16xf32>,
// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32

// CHECK-DAG: %[[CST_0:.+]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
// CHECK-DAG: %[[CST_1:.+]] = arith.constant dense<16> : vector<4x1xindex>
// CHECK-DAG: %[[IDX0:.+]] = vector.extract %[[CST_1]][0, 0] : index from vector<4x1xindex>
// CHECK-DAG: %[[IDX1:.+]] = vector.extract %[[CST_0]][0] : index from vector<4xindex>

// CHECK: %[[VAL_8:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[IDX0]], %[[IDX1]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32>
// CHECK: %[[VAL_9:.*]] = vector.transfer_write %[[VAL_8]], %[[VAL_1]]{{\[}}%[[VAL_4]], %[[VAL_4]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
// CHECK: return %[[VAL_9]] : tensor<1x4xf32>
// CHECK: }

// -----

//===----------------------------------------------------------------------===//
// Gather load
//===----------------------------------------------------------------------===//

#map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
func.func @vectorize_1d_tensor_extract(%arg0: tensor<3xf32>, %arg1: tensor<4x3xi32>, %arg2: tensor<4x7x3x2xf32>) -> tensor<4x7x3x2xf32> {
  %1 = linalg.generic {
    indexing_maps = [#map0, #map1],
    iterator_types = ["parallel", "parallel", "parallel", "parallel"]
  } ins(%arg1 : tensor<4x3xi32>) outs(%arg2 : tensor<4x7x3x2xf32>) {
  ^bb0(%arg3: i32, %arg4: f32):
    %2 = arith.index_cast %arg3 : i32 to index
    %3 = tensor.extract %arg0[%2] : tensor<3xf32>
    linalg.yield %3 : f32
  } -> tensor<4x7x3x2xf32>
  return %1 : tensor<4x7x3x2xf32>
}
// CHECK-LABEL: func.func @vectorize_1d_tensor_extract
// CHECK-SAME: %[[ARG0:.*]]: tensor<3xf32>
// CHECK-SAME: %[[ARG1:.*]]: tensor<4x3xi32>
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[MASK:.*]] = arith.constant dense<true> : vector<4x7x3x2xi1>
// CHECK-DAG: %[[PASSTHRU:.*]] = arith.constant dense<0.000000e+00> : vector<4x7x3x2xf32>
// CHECK: %[[V0:.*]] = vector.transfer_read %[[ARG1]]
// CHECK: %[[CAST:.*]] = arith.index_cast %[[V0]]
// CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[CAST]]
// CHECK: %[[INDICES:.*]] = vector.transpose %[[BROADCAST]]
// CHECK: %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]]] [%[[INDICES]]], %[[MASK]], %[[PASSTHRU]]
// CHECK: vector.transfer_write %[[GATHER]]

// -----

#map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
func.func @vectorize_nd_tensor_extract_index_from_tensor(%arg0: tensor<3x3xf32>, %arg1: tensor<4x3xi32>, %arg2: tensor<4x3xi32>, %arg3: tensor<4x7x2xf32>, %arg4: tensor<4x7x3x2xf32>) -> tensor<4x7x3x2xf32> {
  %2 = linalg.generic {
    indexing_maps = [#map0, #map0, #map1, #map2],
    iterator_types = ["parallel", "parallel", "parallel", "parallel"]
  } ins(%arg1, %arg2, %arg3 : tensor<4x3xi32>, tensor<4x3xi32>, tensor<4x7x2xf32>) outs(%arg4 : tensor<4x7x3x2xf32>) {
  ^bb0(%arg5: i32, %arg6: i32, %arg7: f32, %arg8: f32):
    %3 = arith.index_cast %arg5 : i32 to index
    %4 = arith.index_cast %arg6 : i32 to index
    %7 = tensor.extract %arg0[%3, %4] : tensor<3x3xf32>
    linalg.yield %7 : f32
  } -> tensor<4x7x3x2xf32>
  return %2 : tensor<4x7x3x2xf32>
}
// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_index_from_tensor
// CHECK-SAME: %[[ARG0:.*]]: tensor<3x3xf32>
// CHECK-SAME: %[[ARG1:arg1]]: tensor<4x3xi32>
// CHECK-SAME: %[[ARG2:arg2]]: tensor<4x3xi32>
// CHECK-SAME: %[[ARG3:.*]]: tensor<4x7x2xf32>
// CHECK-SAME: %[[ARG4:.*]]: tensor<4x7x3x2xf32>
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C0_i32:.*]] = arith.constant 0 : i32
// CHECK-DAG: %[[CST:.*]] = arith.constant dense<3> : vector<7x2x4x3xindex>
// CHECK-DAG: %[[CST_1:.*]] = arith.constant dense<true> : vector<4x7x3x2xi1>
// CHECK-DAG: %[[PASSTHRU:.*]] = arith.constant dense<0.000000e+00> : vector<4x7x3x2xf32>
// CHECK: %[[V0:.*]] = vector.transfer_read %[[ARG1]][%[[C0]], %[[C0]]], %[[C0_i32]] {in_bounds = [true, true]} : tensor<4x3xi32>, vector<4x3xi32>
// CHECK: %[[V1:.*]] = vector.transfer_read %[[ARG2]][%[[C0]], %[[C0]]], %[[C0_i32]] {in_bounds = [true, true]} : tensor<4x3xi32>, vector<4x3xi32>
// CHECK: %[[CAST:.*]] = arith.index_cast %[[V0]] : vector<4x3xi32> to vector<4x3xindex>
// CHECK: %[[B1:.*]] = vector.broadcast %[[CAST]] : vector<4x3xindex> to vector<7x2x4x3xindex>
// CHECK: %[[CAST_1:.*]] = arith.index_cast %[[V1]] : vector<4x3xi32> to vector<4x3xindex>
// CHECK: %[[B2:.*]] = vector.broadcast %[[CAST_1]] : vector<4x3xindex> to vector<7x2x4x3xindex>
// CHECK: %[[MULI:.*]] = arith.muli %[[B1]], %[[CST]] : vector<7x2x4x3xindex>
// CHECK: %[[ADDI:.*]] = arith.addi %[[B2]], %[[MULI]] : vector<7x2x4x3xindex>
// CHECK: %[[T:.*]] = vector.transpose %[[ADDI]], [2, 0, 3, 1] : vector<7x2x4x3xindex> to vector<4x7x3x2xindex>
// CHECK: %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]], %[[C0]]] [%[[T]]], %[[CST_1]], %[[PASSTHRU]] : tensor<3x3xf32>, vector<4x7x3x2xindex>, vector<4x7x3x2xi1>, vector<4x7x3x2xf32> into vector<4x7x3x2xf32>
// CHECK: vector.transfer_write %[[GATHER]], %[[ARG4]][%[[C0]], %[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true, true]} : vector<4x7x3x2xf32>, tensor<4x7x3x2xf32>

// -----

#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0 + d1 + d2)>
func.func @vectorize_nd_tensor_extract_load_1d_column_vector_using_gather_load(%arg0: tensor<8x128x768xf32>, %arg1 : index) -> tensor<8x1xf32> {
  %c0 = arith.constant 0 : index
  %0 = tensor.empty() : tensor<8x1xf32>
  %1 = linalg.generic {
    indexing_maps = [#map],
    iterator_types = ["parallel", "parallel"]
  } outs(%0 : tensor<8x1xf32>) {
  ^bb0(%arg5: f32):
    %2 = linalg.index 0 : index
    %3 = linalg.index 1 : index
    %4 = affine.apply #map1(%arg1, %3, %arg1)
    %extracted = tensor.extract %arg0[%2, %c0, %4] : tensor<8x128x768xf32>
    linalg.yield %extracted : f32
  } -> tensor<8x1xf32>
  return %1 : tensor<8x1xf32>
}

// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_load_1d_column_vector_using_gather_load
// CHECK-SAME: %[[ARG0:.*]]: tensor<8x128x768xf32>
// CHECK-SAME: %[[ARG1:.*]]: index
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[CST:.*]] = arith.constant dense<768> : vector<1x8xindex>
// CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<128> : vector<1x8xindex>
// CHECK-DAG: %[[PASSTHRU:.*]] = arith.constant dense<0.000000e+00> : vector<8x1xf32>
// CHECK-DAG: %[[CST_2:.*]] = arith.constant dense<true> : vector<8x1xi1>
// CHECK-DAG: %[[CST_3:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex>
// CHECK: %[[EMPTY:.*]] = tensor.empty() : tensor<8x1xf32>
// CHECK: %[[B1:.*]] = vector.broadcast %[[CST_3]] : vector<8xindex> to vector<1x8xindex>
// CHECK: %[[ADDI_ARG1:.*]] = arith.addi %[[ARG1]], %[[ARG1]] : index
// CHECK: %[[B2:.*]] = vector.broadcast %[[ADDI_ARG1]] : index to vector<1xindex>
// CHECK: %[[MULI_1:.*]] = arith.muli %[[B1]], %[[CST_0]] : vector<1x8xindex>
// CHECK: %[[MULI_2:.*]] = arith.muli %[[MULI_1]], %[[CST]] : vector<1x8xindex>
// CHECK: %[[T:.*]] = vector.transpose %[[MULI_2]], [1, 0] : vector<1x8xindex> to vector<8x1xindex>
// CHECK: %[[B3:.*]] = vector.broadcast %[[B2]] : vector<1xindex> to vector<8x1xindex>
// CHECK: %[[ADDI:.*]] = arith.addi %[[B3]], %[[T]] : vector<8x1xindex>
// CHECK: %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]], %[[C0]], %[[C0]]] [%[[ADDI]]], %[[CST_2]], %[[PASSTHRU]] : tensor<8x128x768xf32>, vector<8x1xindex>, vector<8x1xi1>, vector<8x1xf32> into vector<8x1xf32>
// CHECK: vector.transfer_write %[[GATHER]], %[[EMPTY]][%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<8x1xf32>, tensor<8x1xf32>

// -----

// Reading a 1D column vector (hence a candidate for a contiguous load), but given
// %1, it's a gather load.

#map = affine_map<(d0, d1) -> (d0, d1)>
func.func @index_from_output_column_vector_gather_load(%src: tensor<8x128xf32>) -> tensor<8x1xf32> {
  %c0 = arith.constant 0 : index
  %0 = tensor.empty() : tensor<8x1xf32>
  %res = linalg.generic {
    indexing_maps = [#map],
    iterator_types = ["parallel", "parallel"]
  } outs(%0 : tensor<8x1xf32>) {
  ^bb0(%arg1: f32):
    %1 = linalg.index 0 : index
    %extracted = tensor.extract %src[%1, %c0] : tensor<8x128xf32>
    linalg.yield %extracted : f32
  } -> tensor<8x1xf32>
  return %res : tensor<8x1xf32>
}

// CHECK-LABEL: func.func @index_from_output_column_vector_gather_load(
// CHECK-SAME: %[[SRC:.*]]: tensor<8x128xf32>) -> tensor<8x1xf32> {
// CHECK: %[[C128:.*]] = arith.constant dense<128> : vector<1x8xindex>
// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: %[[PASS_THRU:.*]] = arith.constant dense<0.000000e+00> : vector<8x1xf32>
// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<8x1xi1>
// CHECK: %[[IDX_VEC:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex>
// CHECK: %[[OUT:.*]] = tensor.empty() : tensor<8x1xf32>
// CHECK: %[[B:.*]] = vector.broadcast %[[IDX_VEC]] : vector<8xindex> to vector<1x8xindex>
// CHECK: %[[MUL:.*]] = arith.muli %[[B]], %[[C128]] : vector<1x8xindex>
// CHECK: %[[TR:.*]] = vector.transpose %[[MUL]], [1, 0] : vector<1x8xindex> to vector<8x1xindex>
// CHECK: %[[GATHER:.*]] = vector.gather %[[SRC]]{{\[}}%[[C0]], %[[C0]]] {{\[}}%[[TR]]], %[[MASK]], %[[PASS_THRU]] : tensor<8x128xf32>, vector<8x1xindex>, vector<8x1xi1>, vector<8x1xf32> into vector<8x1xf32>
// CHECK: %[[RES:.*]] = vector.transfer_write %[[GATHER]], %[[OUT]]{{\[}}%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<8x1xf32>, tensor<8x1xf32>
// CHECK: return %[[RES]] : tensor<8x1xf32>

// -----

// Same as above, but the access indices have been swapped and hence this _is_
// a contiguous load. Currently not supported and lowered as vector.gather
// instead.
// TODO: Make sure that this is lowered as a contiguous load.

#map = affine_map<(d0, d1) -> (d0, d1)>
func.func @index_from_output_column_vector_contiguous_load(%src: tensor<8x128xf32>) -> tensor<8x1xf32> {
  %c0 = arith.constant 0 : index
  %0 = tensor.empty() : tensor<8x1xf32>
  %res = linalg.generic {
    indexing_maps = [#map],
    iterator_types = ["parallel", "parallel"]
  } outs(%0 : tensor<8x1xf32>) {
  ^bb0(%arg1: f32):
    %1 = linalg.index 0 : index
    %extracted = tensor.extract %src[%c0, %1] : tensor<8x128xf32>
    linalg.yield %extracted : f32
  } -> tensor<8x1xf32>
  return %res : tensor<8x1xf32>
}

// CHECK-LABEL: func.func @index_from_output_column_vector_contiguous_load(
// CHECK-SAME: %[[SRC:.*]]: tensor<8x128xf32>) -> tensor<8x1xf32> {
// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: %[[PASS_THRU:.*]] = arith.constant dense<0.000000e+00> : vector<8x1xf32>
// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<8x1xi1>
// CHECK: %[[IDX_VEC:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex>
// CHECK: %[[OUT:.*]] = tensor.empty() : tensor<8x1xf32>
// CHECK: %[[B:.*]] = vector.broadcast %[[IDX_VEC]] : vector<8xindex> to vector<1x8xindex>
// CHECK: %[[TR:.*]] = vector.transpose %[[B]], [1, 0] : vector<1x8xindex> to vector<8x1xindex>
// CHECK: %[[GATHER:.*]] = vector.gather %[[SRC]]{{\[}}%[[C0]], %[[C0]]] {{\[}}%[[TR]]], %[[MASK]], %[[PASS_THRU]] : tensor<8x128xf32>, vector<8x1xindex>, vector<8x1xi1>, vector<8x1xf32> into vector<8x1xf32>
// CHECK: %[[RES:.*]] = vector.transfer_write %[[GATHER]], %[[OUT]]{{\[}}%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<8x1xf32>, tensor<8x1xf32>
// CHECK: return %[[RES]] : tensor<8x1xf32>

// -----

#map = affine_map<(d0) -> (d0)>
func.func @vectorize_nd_tensor_extract_contiguous_and_gather(%arg0: tensor<6xf32>, %arg1: tensor<5xi32>) -> tensor<5xf32> {
  %c5 = arith.constant 5 : index
  %c0 = arith.constant 0 : index
  %0 = tensor.empty() : tensor<5xf32>
  %1 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel"]} outs(%0 : tensor<5xf32>) {
  ^bb0(%out: f32):
    %2 = linalg.index 0 : index
    %extracted = tensor.extract %arg1[%2] : tensor<5xi32>
    %3 = arith.index_cast %extracted : i32 to index
    %4 = arith.maxsi %3, %c0 : index
    %5 = arith.minsi %4, %c5 : index
    %extracted_0 = tensor.extract %arg0[%5] : tensor<6xf32>
    linalg.yield %extracted_0 : f32
  } -> tensor<5xf32>
  return %1 : tensor<5xf32>
}

// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_contiguous_and_gather(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<6xf32>
// CHECK-SAME: %[[VAL_1:.*]]: tensor<5xi32>
// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : i32
// CHECK-DAG: %[[VAL_4:.*]] = arith.constant dense<0> : vector<5xindex>
// CHECK-DAG: %[[VAL_5:.*]] = arith.constant dense<5> : vector<5xindex>
// CHECK-DAG: %[[VAL_6:.*]] = arith.constant dense<true> : vector<5xi1>
// CHECK-DAG: %[[VAL_7:.*]] = arith.constant dense<0.000000e+00> : vector<5xf32>
// CHECK: %[[VAL_8:.*]] = tensor.empty() : tensor<5xf32>
// CHECK: %[[VAL_9:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%{{.*}}], %[[VAL_3]] {in_bounds = [true]} : tensor<5xi32>, vector<5xi32>
// CHECK: %[[VAL_10:.*]] = arith.index_cast %[[VAL_9]] : vector<5xi32> to vector<5xindex>
// CHECK: %[[VAL_11:.*]] = arith.maxsi %[[VAL_10]], %[[VAL_4]] : vector<5xindex>
// CHECK: %[[VAL_12:.*]] = arith.minsi %[[VAL_11]], %[[VAL_5]] : vector<5xindex>
// CHECK: %[[VAL_13:.*]] = vector.gather %[[VAL_0]]{{\[}}%[[VAL_2]]] {{\[}}%[[VAL_12]]], %[[VAL_6]], %[[VAL_7]] : tensor<6xf32>, vector<5xindex>, vector<5xi1>, vector<5xf32> into vector<5xf32>
// CHECK: %[[VAL_14:.*]] = vector.transfer_write %[[VAL_13]], %[[VAL_8]]{{\[}}%[[VAL_2]]] {in_bounds = [true]} : vector<5xf32>, tensor<5xf32>
// CHECK: return %[[VAL_14]] : tensor<5xf32>

// The vectorizer converts `affine.apply` so that the subsequent Ops can be vectorised based on the converted ops. Gather load.
func.func @vectorize_nd_tensor_extract_with_affine_apply_gather(%6: tensor<80x16xf32>, %arg0: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
  %c16 = arith.constant 16 : index
  %1 = linalg.generic {
    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
    iterator_types = ["parallel", "parallel"]
  } outs(%extracted_slice : tensor<1x4xf32>) {
  ^bb0(%out: f32):
    %2 = linalg.index 1 : index
    %3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%2, %arg0)
    %extracted = tensor.extract %6[%3, %c16] : tensor<80x16xf32>
    linalg.yield %extracted : f32
  } -> tensor<1x4xf32>
  return %1 : tensor<1x4xf32>
}

// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_affine_apply_gather(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<80x16xf32>,
// CHECK-SAME: %[[VAL_1:.*]]: index,
// CHECK-SAME: %[[VAL_2:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
// CHECK-DAG: %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
// CHECK-DAG: %[[VAL_4:.*]] = arith.constant dense<true> : vector<1x4xi1>
// CHECK-DAG: %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<1x4xf32>
// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[VAL_7:.*]] = arith.constant dense<16> : vector<1x4xindex>
// CHECK: %[[VAL_8:.*]] = vector.broadcast %[[VAL_1]] : index to vector<4xindex>
// CHECK: %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_3]] : vector<4xindex>
// CHECK: %[[VAL_10:.*]] = vector.broadcast %[[VAL_9]] : vector<4xindex> to vector<1x4xindex>
// CHECK: %[[VAL_11:.*]] = arith.muli %[[VAL_10]], %[[VAL_7]] : vector<1x4xindex>
// CHECK: %[[VAL_12:.*]] = arith.addi %[[VAL_11]], %[[VAL_7]] : vector<1x4xindex>
// CHECK: %[[VAL_13:.*]] = vector.gather %[[VAL_0]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {{\[}}%[[VAL_12]]], %[[VAL_4]], %[[VAL_5]] : tensor<80x16xf32>, vector<1x4xindex>, vector<1x4xi1>, vector<1x4xf32> into vector<1x4xf32>
// CHECK: %[[VAL_14:.*]] = vector.transfer_write %[[VAL_13]], %[[VAL_2]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
// CHECK: return %[[VAL_14]] : tensor<1x4xf32>
// CHECK: }

// Make sure that non-linear arithmetic operations (e.g. arith.maxsi) are allowed when calculating indices for load operations. Gather load.
func.func @vectorize_nd_tensor_extract_with_maxsi_gather(%arg0: tensor<80x16xf32>, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
  %c79 = arith.constant 79 : index
  %1 = linalg.generic {
    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
    iterator_types = ["parallel", "parallel"]
  } outs(%extracted_slice : tensor<1x4xf32>) {
  ^bb0(%out: f32):
    %2 = linalg.index 1 : index
    %3 = arith.maxsi %2, %c79 : index
    %extracted = tensor.extract %arg0[%3, %2] : tensor<80x16xf32>
    linalg.yield %extracted : f32
  } -> tensor<1x4xf32>
  return %1 : tensor<1x4xf32>
}

// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_maxsi_gather(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<80x16xf32>,
// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
// CHECK-DAG: %[[VAL_2:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
// CHECK-DAG: %[[VAL_3:.*]] = arith.constant dense<1264> : vector<1x4xindex>
// CHECK-DAG: %[[VAL_4:.*]] = arith.constant dense<true> : vector<1x4xi1>
// CHECK-DAG: %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<1x4xf32>
// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index
// CHECK: %[[VAL_7:.*]] = vector.broadcast %[[VAL_2]] : vector<4xindex> to vector<1x4xindex>
// CHECK: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_3]] : vector<1x4xindex>
// CHECK: %[[VAL_9:.*]] = vector.gather %[[VAL_0]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {{\[}}%[[VAL_8]]], %[[VAL_4]], %[[VAL_5]] : tensor<80x16xf32>, vector<1x4xindex>, vector<1x4xi1>, vector<1x4xf32> into vector<1x4xf32>
// CHECK: %[[VAL_10:.*]] = vector.transfer_write %[[VAL_9]], %[[VAL_1]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
// CHECK: return %[[VAL_10]] : tensor<1x4xf32>
// CHECK: }

// -----

// The vectorizer assumes it's a gather load whenever using a block argument to calculate an index.
#map = affine_map<(d0) -> (d0)>
func.func @vectorize_nd_tensor_extract_block_arg(%arg0: tensor<5x6xf32>, %arg1: tensor<5xindex>) -> tensor<5xf32> {
  %0 = tensor.empty() : tensor<5xf32>
  %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg1: tensor<5xindex>) outs(%0 : tensor<5xf32>) {
  ^bb0(%in: index, %out: f32):
    %2 = linalg.index 0 : index
    %extracted_0 = tensor.extract %arg0[%in, %2] : tensor<5x6xf32>
    linalg.yield %extracted_0 : f32
  } -> tensor<5xf32>
  return %1 : tensor<5xf32>
}

// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_block_arg(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<5x6xf32>,
// CHECK-SAME: %[[VAL_1:.*]]: tensor<5xindex>) -> tensor<5xf32> {
// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3, 4]> : vector<5xindex>
// CHECK-DAG: %[[VAL_4:.*]] = arith.constant dense<true> : vector<5xi1>
// CHECK-DAG: %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<5xf32>
// CHECK-DAG: %[[VAL_6:.*]] = arith.constant dense<6> : vector<5xindex>
// CHECK: %[[VAL_7:.*]] = tensor.empty() : tensor<5xf32>
// CHECK: %[[VAL_8:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_2]]], %[[VAL_2]] {in_bounds = [true]} : tensor<5xindex>, vector<5xindex>
// CHECK: %[[VAL_9:.*]] = arith.muli %[[VAL_8]], %[[VAL_6]] : vector<5xindex>
// CHECK: %[[VAL_10:.*]] = arith.addi %[[VAL_9]], %[[VAL_3]] : vector<5xindex>
// CHECK: %[[VAL_11:.*]] = vector.gather %[[VAL_0]]{{\[}}%[[VAL_2]], %[[VAL_2]]] {{\[}}%[[VAL_10]]], %[[VAL_4]], %[[VAL_5]] : tensor<5x6xf32>, vector<5xindex>, vector<5xi1>, vector<5xf32> into vector<5xf32>
// CHECK: %[[VAL_12:.*]] = vector.transfer_write %[[VAL_11]], %[[VAL_7]]{{\[}}%[[VAL_2]]] {in_bounds = [true]} : vector<5xf32>, tensor<5xf32>
// CHECK: return %[[VAL_12]] : tensor<5xf32>
// CHECK: }

// -----

#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0 + d1 + d2)>
func.func @vectorize_reverse_like_tensor_extract(%arg0: tensor<1x2x3xf32>, %arg1: tensor<1x1x3xf32>, %arg2: index) -> tensor<1x1x3xf32> {
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %c2 = arith.constant 2 : index
  %0 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel", "parallel"]} outs(%arg1 : tensor<1x1x3xf32>) {
  ^bb0(%out: f32):
    %1 = linalg.index 1 : index
    %2 = linalg.index 0 : index
    %3 = affine.apply #map1(%1, %2, %arg2)
    %4 = linalg.index 2 : index
    %5 = arith.subi %c2, %4 : index
    %extracted = tensor.extract %arg0[%c0, %3, %5] : tensor<1x2x3xf32>
    linalg.yield %extracted : f32
  } -> tensor<1x1x3xf32>
  return %0 : tensor<1x1x3xf32>
}
// CHECK-LABEL: func.func @vectorize_reverse_like_tensor_extract
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]*]]
// CHECK-DAG: %[[CST:.+]] = arith.constant dense<3> : vector<1x1x3xindex>
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[MASK:.*]] = arith.constant dense<true> : vector<1x1x3xi1>
// CHECK-DAG: %[[PASSTHRU:.*]] = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
// CHECK-DAG: %[[INIT_IDX:.+]] = arith.constant dense<[2, 1, 0]> : vector<3xindex>
// CHECK: %[[T0:.+]] = vector.broadcast %[[ARG2]] : index to vector<1x1x3xindex>
// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[CST]] : vector<1x1x3xindex>
// CHECK: %[[T2:.+]] = vector.broadcast %[[INIT_IDX]]
// CHECK: %[[T3:.+]] = arith.addi %[[T2]], %[[T1]]
// CHECK: %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]], %[[C0]], %[[C0]]] [%[[T3]]], %[[MASK]], %[[PASSTHRU]]
// CHECK: vector.transfer_write %[[GATHER]]

//===----------------------------------------------------------------------===//
// Scalar load + broadcast
//===----------------------------------------------------------------------===//

// -----

#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
func.func @vectorize_nd_tensor_extract_scalar_broadcast(%src: tensor<3x3xf32>, %init: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
  %c0 = arith.constant 1 : index
  %c1 = arith.constant 2 : index

  %res = linalg.generic {
    indexing_maps = [#map],
    iterator_types = ["parallel", "parallel", "parallel"]
  } outs(%init : tensor<1x1x3xf32>) {
  ^bb0(%arg4: f32):
    %1 = tensor.extract %src[%c0, %c1] : tensor<3x3xf32>
    linalg.yield %1 : f32
  } -> tensor<1x1x3xf32>

  return %res : tensor<1x1x3xf32>
}

// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_scalar_broadcast(
// CHECK-SAME: %[[SRC:.*]]: tensor<3x3xf32>,
// CHECK-SAME: %[[INIT:.*]]: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][%[[C1]], %[[C2]]], %[[PAD]] : tensor<3x3xf32>, vector<f32>
// CHECK: %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1x1x3xf32>
// CHECK: vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x3xf32>

// -----

#map = affine_map<() -> ()>
func.func @extract_scalar_from_0d_into_0d(%src: tensor<f32>, %init: tensor<f32>) -> tensor<f32> {
  %res = linalg.generic {
    indexing_maps = [#map],
    iterator_types = []
  } outs(%init : tensor<f32>) {
  ^bb0(%in: f32):
    %1 = tensor.extract %src[] : tensor<f32>
    linalg.yield %1 : f32
  } -> tensor<f32>

  return %res : tensor<f32>
}

// CHECK-LABEL: func.func @extract_scalar_from_0d_into_0d(
// CHECK-SAME: %[[SRC:.*]]: tensor<f32>,
// CHECK-SAME: %[[INIT:.*]]: tensor<f32>) -> tensor<f32> {
// CHECK: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][], %[[PAD]] : tensor<f32>, vector<f32>
// CHECK: vector.transfer_write %[[READ]], %[[INIT]][] : vector<f32>, tensor<f32>

// -----

#map = affine_map<(n) -> (n)>
func.func @extract_scalar_from_0d_into_1d(%src: tensor<f32>, %init: tensor<1xf32>) -> tensor<1xf32> {
  %res = linalg.generic {
    indexing_maps = [#map],
    iterator_types = ["parallel"]
  } outs(%init : tensor<1xf32>) {
  ^bb0(%in: f32):
    %1 = tensor.extract %src[] : tensor<f32>
    linalg.yield %1 : f32
  } -> tensor<1xf32>

  return %res : tensor<1xf32>
}
// CHECK-LABEL: func.func @extract_scalar_from_0d_into_1d(
// CHECK-SAME: %[[SRC:.*]]: tensor<f32>,
// CHECK-SAME: %[[INIT:.*]]: tensor<1xf32>) -> tensor<1xf32> {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][], %[[PAD]] : tensor<f32>, vector<f32>
// CHECK: %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1xf32>
// CHECK: vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]]] {in_bounds = [true]} : vector<1xf32>, tensor<1xf32>

// -----

#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
func.func @vectorize_0d_tensor_extract(%src: tensor<f32>, %init: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
  %res = linalg.generic {
    indexing_maps = [#map1],
    iterator_types = ["parallel", "parallel", "parallel"]
  } outs(%init : tensor<1x1x3xf32>) {
  ^bb0(%arg4: f32):
    %1 = tensor.extract %src[] : tensor<f32>
    linalg.yield %1 : f32
  } -> tensor<1x1x3xf32>
  return %res : tensor<1x1x3xf32>
}

// CHECK-LABEL: func.func @vectorize_0d_tensor_extract(
// CHECK-SAME: %[[SRC:.*]]: tensor<f32>
// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][], %{{.+}} : tensor<f32>
// CHECK: vector.broadcast %[[READ]] : vector<f32> to vector<1x1x3xf32>

// -----

func.func @scalar_read_with_broadcast_from_column_tensor(%init: tensor<1x1x4xi32>) -> tensor<1x1x4xi32> {
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %src = arith.constant dense<[[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14]]> : tensor<15x1xi32>

  %res = linalg.generic {
    indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
    iterator_types = ["parallel", "parallel", "parallel"]
  } outs(%init : tensor<1x1x4xi32>) {

  ^bb0(%out: i32):
    %idx = linalg.index 0 : index
    %extracted = tensor.extract %src[%idx, %c0] : tensor<15x1xi32>
    linalg.yield %extracted : i32
  } -> tensor<1x1x4xi32>

  return %res : tensor<1x1x4xi32>
}

// CHECK-LABEL: func.func @scalar_read_with_broadcast_from_column_tensor
// CHECK-SAME: %[[INIT:.*]]: tensor<1x1x4xi32>) -> tensor<1x1x4xi32> {
// CHECK-DAG: %[[PAD:.*]] = arith.constant 0 : i32
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[SRC:.*]] = arith.constant dense<{{\[\[}}0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14]]> : tensor<15x1xi32>
// CHECK-DAG: %[[IDX_VEC:.*]] = arith.constant dense<0> : vector<1xindex>
// CHECK: %[[IDX_ELT:.*]] = vector.extract %[[IDX_VEC]][0] : index from vector<1xindex>
// CHECK: %[[READ:.*]] =
vector.transfer_read %[[SRC]]{{\[}}%[[IDX_ELT]], %[[C0]]], %[[PAD]] : tensor<15x1xi32>, vector<i32> 681// CHECK: %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<i32> to vector<1x1x4xi32> 682// CHECK: %[[RES:.*]] = vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, tensor<1x1x4xi32> 683 684// ----- 685 686// TODO: Currently this fails to vectorise when the indices are non-constant. 687 688#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> 689func.func @vectorize_nd_tensor_extract_transfer_read_basic_column( 690 %src: tensor<3x3x3xf32>, 691 %init: tensor<3x1x1xf32>) -> tensor<3x1x1xf32> { 692 693 %c0 = arith.constant 0 : index 694 695 %res = linalg.generic { 696 indexing_maps = [#map], 697 iterator_types = ["parallel", "parallel", "parallel"] 698 } outs(%init : tensor<3x1x1xf32>) { 699 ^bb0(%out: f32): 700 %1 = tensor.extract %src[%c0, %c0, %c0] : tensor<3x3x3xf32> 701 linalg.yield %1 : f32 702 } -> tensor<3x1x1xf32> 703 704 return %res : tensor<3x1x1xf32> 705} 706 707// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_transfer_read_basic_column( 708// CHECK-SAME: %[[SRC:.*]]: tensor<3x3x3xf32>, 709// CHECK-SAME: %[[INIT:.*]]: tensor<3x1x1xf32>) 710// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index 711// CHECK-DAG: %[[CST_0:.*]] = arith.constant 0.000000e+00 : f32 712// CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]][%[[C0]], %[[C0]], %[[C0]]], %[[CST_0]] : tensor<3x3x3xf32>, vector<f32> 713// CHECK: %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<3x1x1xf32> 714// CHECK: vector.transfer_write %[[READ_BCAST]], %[[INIT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<3x1x1xf32>, tensor<3x1x1xf32> 715