// RUN: mlir-opt -split-input-file \
// RUN: -transform-preload-library='transform-library-paths=%p/td/vectorize-with-patterns.mlir' \
// RUN: -transform-interpreter=entry-point=vectorize_with_patterns %s | FileCheck %s

//===----------------------------------------------------------------------===//
// Contiguous load
//===----------------------------------------------------------------------===//

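// A `tensor.extract` inside a `linalg.generic` is vectorized as a single
// `vector.transfer_read` when the computed indices are either loop-invariant
// or contiguous, i.e. increasing in steps of 1 along the trailing output
// dimension. Otherwise the vectorizer falls back to `vector.gather` (see the
// "Gather load" section below).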
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
func.func @vectorize_nd_tensor_extract_transfer_read_basic(
    %arg0: tensor<3x3x3xf32>,
    %arg1: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {

  %res = linalg.generic {
    indexing_maps = [#map],
    iterator_types = ["parallel", "parallel", "parallel"]
  } outs(%arg1 : tensor<1x1x3xf32>) {
  ^bb0(%out: f32):
    %1 = linalg.index 0 : index
    %2 = linalg.index 1 : index
    %3 = linalg.index 2 : index
    %4 = tensor.extract %arg0[%1, %2, %3] : tensor<3x3x3xf32>
    linalg.yield %4 : f32
  } -> tensor<1x1x3xf32>

  return %res : tensor<1x1x3xf32>
}

// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_transfer_read_basic
// CHECK-SAME: %[[ARG0:.*]]: tensor<3x3x3xf32>
// CHECK-SAME: %[[ARG1:.*]]: tensor<1x1x3xf32>

// CHECK-DAG:  %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG:  %[[CST:.+]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG:  %[[CST_0:.+]] = arith.constant dense<0> : vector<1xindex>
// CHECK-DAG:  %[[CST_1:.+]] = arith.constant dense<[0, 1, 2]> : vector<3xindex>

// CHECK-DAG: %[[IDX1:.+]] = vector.extract %[[CST_0]][0] : index from vector<1xindex>
// CHECK-DAG: %[[IDX2:.+]] = vector.extract %[[CST_0]][0] : index from vector<1xindex>
// CHECK-DAG: %[[IDX3:.+]] = vector.extract %[[CST_1]][0] : index from vector<3xindex>

// CHECK:   %[[READ:.*]] = vector.transfer_read %[[ARG0]][%[[IDX1]], %[[IDX2]], %[[IDX3]]], %[[CST]] {in_bounds = [true, true, true]} : tensor<3x3x3xf32>, vector<1x1x3xf32>
// CHECK:   vector.transfer_write %[[READ]], %[[ARG1]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x3xf32>

// -----

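// Only the trailing output dimension is non-unit, so the load below is
// contiguous: the loop-invariant leading index is computed with scalar
// arithmetic, while the trailing index is recovered by extracting element 0
// of the vectorized index computation (%[[VAL_19]] in the CHECK lines).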
func.func @vectorize_nd_tensor_extract_transfer_read_complex(%6: tensor<45x80x16xf32>, %arg0: index, %arg2: index, %arg1: index, %arg4: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
  %c79 = arith.constant 79 : index
  %25 = linalg.generic {
    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
    iterator_types = ["parallel", "parallel"]
  } outs(%extracted_slice : tensor<1x4xf32>) {
  ^bb0(%out: f32):
    %26 = linalg.index 0 : index
    %27 = arith.addi %arg0, %26 : index
    %28 = arith.addi %27, %arg2 : index
    %29 = linalg.index 1 : index
    %30 = arith.addi %arg1, %29 : index
    %31 = arith.addi %30, %arg4 : index
    %extracted = tensor.extract %6[%28, %c79, %31] : tensor<45x80x16xf32>
    linalg.yield %extracted : f32
  } -> tensor<1x4xf32>
  return %25 : tensor<1x4xf32>
}

// CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_transfer_read_complex(
// CHECK-SAME:      %[[VAL_0:.*]]: tensor<45x80x16xf32>,
// CHECK-SAME:      %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index, %[[VAL_3:.*]]: index, %[[VAL_4:.*]]: index,
// CHECK-SAME:      %[[VAL_5:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
// CHECK-DAG:       %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG:       %[[VAL_9:.*]] = arith.constant 0 : index
// CHECK-DAG:       %[[VAL_10:.*]] = arith.constant 79 : index
// CHECK:           %[[VAL_11:.*]] = arith.addi %[[VAL_1]], %[[VAL_2]] : index
// CHECK:           %[[VAL_13:.*]] = vector.broadcast %[[VAL_3]] : index to vector<4xindex>
// CHECK:           %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_6]] : vector<4xindex>
// CHECK:           %[[VAL_15:.*]] = vector.broadcast %[[VAL_4]] : index to vector<4xindex>
// CHECK:           %[[VAL_16:.*]] = arith.addi %[[VAL_14]], %[[VAL_15]] : vector<4xindex>

// CHECK:           %[[VAL_19:.*]] = vector.extract %[[VAL_16]][0] : index from vector<4xindex>

// CHECK:           %[[VAL_20:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_11]], %[[VAL_10]], %[[VAL_19]]], %[[VAL_8]] {in_bounds = [true, true]} : tensor<45x80x16xf32>, vector<1x4xf32>
// CHECK:           %[[VAL_21:.*]] = vector.transfer_write %[[VAL_20]], %[[VAL_5]]{{\[}}%[[VAL_9]], %[[VAL_9]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
// CHECK:           return %[[VAL_21]] : tensor<1x4xf32>
// CHECK:         }

// -----

// The vectorizer converts `affine.apply` so that the subsequent ops can be
// vectorized based on the converted ops. Contiguous load.
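// The `affine.apply` below computes `d0 + d1`, which is vectorized as a
// `vector.broadcast` of the scalar operand followed by an `arith.addi` with
// the constant index vector [0, 1, 2, 3] (see the CHECK lines below).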
func.func @vectorize_nd_tensor_extract_with_affine_apply_contiguous(%6: tensor<80x16xf32>, %arg0: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
  %c79 = arith.constant 79 : index
  %1 = linalg.generic {
    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
    iterator_types = ["parallel", "parallel"]
  } outs(%extracted_slice : tensor<1x4xf32>) {
  ^bb0(%out: f32):
    %2 = linalg.index 1 : index
    %3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%2, %arg0)
    %extracted = tensor.extract %6[%c79, %3] : tensor<80x16xf32>
    linalg.yield %extracted : f32
  } -> tensor<1x4xf32>
  return %1 : tensor<1x4xf32>
}

// CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_with_affine_apply_contiguous(
// CHECK-SAME:                                                                        %[[VAL_0:.*]]: tensor<80x16xf32>,
// CHECK-SAME:                                                                        %[[VAL_1:.*]]: index,
// CHECK-SAME:                                                                        %[[VAL_2:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 0 : index
// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 79 : index
// CHECK:           %[[VAL_8:.*]] = vector.broadcast %[[VAL_1]] : index to vector<4xindex>
// CHECK:           %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_3]] : vector<4xindex>
// CHECK:           %[[VAL_10:.*]] = vector.extract %[[VAL_9]][0] : index from vector<4xindex>
// CHECK:           %[[VAL_11:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_10]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32>
// CHECK:           %[[VAL_12:.*]] = vector.transfer_write %[[VAL_11]], %[[VAL_2]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
// CHECK:           return %[[VAL_12]] : tensor<1x4xf32>
// CHECK:         }

// -----

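// The index loaded from %input_1 is clamped with `arith.maxsi`/`arith.minsi`
// before being used to address %input_2. Both extracts still vectorize: the
// first as a loop-invariant scalar load, the second as a contiguous load.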
func.func @vectorize_nd_tensor_extract_with_tensor_extract(%input_1: tensor<1x20xi32>, %input_2: tensor<257x24xf32>, %arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index) -> tensor<1x1x4xf32> {
  %c0 = arith.constant 0 : index
  %c256 = arith.constant 256 : index
  %output = tensor.empty() : tensor<1x1x4xf32>
  %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} outs(%output : tensor<1x1x4xf32>) {
  ^bb0(%out: f32):
    %13 = linalg.index 0 : index
    %14 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 + d2)>(%arg0, %13, %arg2)
    %15 = linalg.index 2 : index
    %16 = linalg.index 1 : index
    %17 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 * 24 + d2 + d3)>(%arg1, %16, %15, %arg3)
    %extracted_0 = tensor.extract %input_1[%c0, %14] : tensor<1x20xi32>
    %18 = arith.index_cast %extracted_0 : i32 to index
    %19 = arith.maxsi %18, %c0 : index
    %20 = arith.minsi %19, %c256 : index
    %extracted_1 = tensor.extract %input_2[%20, %17] : tensor<257x24xf32>
    linalg.yield %extracted_1 : f32
  } -> tensor<1x1x4xf32>
  return %1 : tensor<1x1x4xf32>
}

// CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_with_tensor_extract(
// CHECK-SAME:      %[[INPUT_1:.*]]: tensor<1x20xi32>,
// CHECK-SAME:      %[[INPUT_2:.*]]: tensor<257x24xf32>,
// CHECK-SAME:      %[[INPUT_3:.*]]: index, %[[INPUT_4:.*]]: index, %[[INPUT_5:.*]]: index,
// CHECK:           %[[EXTRACTED_0_IDX_0:.*]] = arith.constant 0 : index
// CHECK:           %[[SCALAR:.*]] = arith.addi %[[INPUT_3]], %[[INPUT_5]] : index
// First `vector.transfer_read` from the generic op - loop-invariant scalar load.
// CHECK:           vector.transfer_read %[[INPUT_1]][%[[EXTRACTED_0_IDX_0]], %[[SCALAR]]]
// CHECK-SAME:      tensor<1x20xi32>, vector<i32>
// The following `tensor.extract` from the generic op is a contiguous load (all ops used
// for address calculation also satisfy the required conditions).
// CHECK:           vector.transfer_read %[[INPUT_2]][%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, true]} : tensor<257x24xf32>, vector<1x4xf32>

// -----

// Make sure that non-linear arithmetic operations (e.g. arith.maxsi) are
// allowed when calculating indices for load operations. Contiguous load.
func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(%arg0: tensor<80x16xf32>, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
  %c16 = arith.constant 16 : index
  %1 = linalg.generic {
    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
    iterator_types = ["parallel", "parallel"]
  } outs(%extracted_slice : tensor<1x4xf32>) {
  ^bb0(%out: f32):
    %2 = linalg.index 0 : index
    %3 = linalg.index 1 : index
    %4 = arith.maxsi %2, %c16 : index
    %extracted = tensor.extract %arg0[%4, %3] : tensor<80x16xf32>
    linalg.yield %extracted : f32
  } -> tensor<1x4xf32>
  return %1 : tensor<1x4xf32>
}

// CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(
// CHECK-SAME:                                                                 %[[VAL_0:.*]]: tensor<80x16xf32>,
// CHECK-SAME:                                                                 %[[VAL_1:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0 : index
// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32

// CHECK-DAG:       %[[CST_0:.+]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
// CHECK-DAG:       %[[CST_1:.+]] = arith.constant dense<16> : vector<4x1xindex>
// CHECK-DAG:       %[[IDX0:.+]] = vector.extract %[[CST_1]][0, 0] : index from vector<4x1xindex>
// CHECK-DAG:       %[[IDX1:.+]] = vector.extract %[[CST_0]][0] : index from vector<4xindex>

// CHECK:           %[[VAL_8:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[IDX0]], %[[IDX1]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32>
// CHECK:           %[[VAL_9:.*]] = vector.transfer_write %[[VAL_8]], %[[VAL_1]]{{\[}}%[[VAL_4]], %[[VAL_4]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
// CHECK:           return %[[VAL_9]] : tensor<1x4xf32>
// CHECK:         }

// -----

//===----------------------------------------------------------------------===//
// Gather load
//===----------------------------------------------------------------------===//

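// When the computed indices are neither loop-invariant nor contiguous (e.g.
// they are loaded from another tensor), every element needs its own
// linearized offset, so the extract is lowered to `vector.gather`.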
#map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
func.func @vectorize_1d_tensor_extract(%arg0: tensor<3xf32>, %arg1: tensor<4x3xi32>, %arg2: tensor<4x7x3x2xf32>) -> tensor<4x7x3x2xf32> {
  %1 = linalg.generic {
    indexing_maps = [#map0, #map1],
    iterator_types = ["parallel", "parallel", "parallel", "parallel"]
  } ins(%arg1 : tensor<4x3xi32>) outs(%arg2 : tensor<4x7x3x2xf32>) {
  ^bb0(%arg3: i32, %arg4: f32):
    %2 = arith.index_cast %arg3 : i32 to index
    %3 = tensor.extract %arg0[%2] : tensor<3xf32>
    linalg.yield %3 : f32
  } -> tensor<4x7x3x2xf32>
  return %1 : tensor<4x7x3x2xf32>
}
// CHECK-LABEL: func.func @vectorize_1d_tensor_extract
// CHECK-SAME:    %[[ARG0:.*]]: tensor<3xf32>
// CHECK-SAME:    %[[ARG1:.*]]: tensor<4x3xi32>
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[MASK:.*]] = arith.constant dense<true> : vector<4x7x3x2xi1>
// CHECK-DAG: %[[PASSTHRU:.*]] = arith.constant dense<0.000000e+00> : vector<4x7x3x2xf32>
// CHECK: %[[V0:.*]] = vector.transfer_read %[[ARG1]]
// CHECK: %[[CAST:.*]] = arith.index_cast %[[V0]]
// CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[CAST]]
// CHECK: %[[INDICES:.*]] = vector.transpose %[[BROADCAST]]
// CHECK: %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]]] [%[[INDICES]]], %[[MASK]], %[[PASSTHRU]]
// CHECK: vector.transfer_write %[[GATHER]]

// -----

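// The gather offsets below are linearized as `idx0 * 3 + idx1` (row-major
// layout of the 3x3 source), which is where the `dense<3>` multiplier in the
// CHECK lines comes from.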
#map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
func.func @vectorize_nd_tensor_extract_index_from_tensor(%arg0: tensor<3x3xf32>, %arg1: tensor<4x3xi32>, %arg2: tensor<4x3xi32>, %arg3: tensor<4x7x2xf32>, %arg4: tensor<4x7x3x2xf32>) -> tensor<4x7x3x2xf32> {
  %2 = linalg.generic {
    indexing_maps = [#map0, #map0, #map1, #map2],
    iterator_types = ["parallel", "parallel", "parallel", "parallel"]
  } ins(%arg1, %arg2, %arg3 : tensor<4x3xi32>, tensor<4x3xi32>, tensor<4x7x2xf32>) outs(%arg4 : tensor<4x7x3x2xf32>) {
  ^bb0(%arg5: i32, %arg6: i32, %arg7: f32, %arg8: f32):
    %3 = arith.index_cast %arg5 : i32 to index
    %4 = arith.index_cast %arg6 : i32 to index
    %7 = tensor.extract %arg0[%3, %4] : tensor<3x3xf32>
    linalg.yield %7 : f32
  } -> tensor<4x7x3x2xf32>
  return %2 : tensor<4x7x3x2xf32>
}
// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_index_from_tensor
// CHECK-SAME: %[[ARG0:.*]]: tensor<3x3xf32>
// CHECK-SAME: %[[ARG1:arg1]]: tensor<4x3xi32>
// CHECK-SAME: %[[ARG2:arg2]]: tensor<4x3xi32>
// CHECK-SAME: %[[ARG3:.*]]: tensor<4x7x2xf32>
// CHECK-SAME: %[[ARG4:.*]]: tensor<4x7x3x2xf32>
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C0_i32:.*]] = arith.constant 0 : i32
// CHECK-DAG: %[[CST:.*]] = arith.constant dense<3> : vector<7x2x4x3xindex>
// CHECK-DAG: %[[CST_1:.*]] = arith.constant dense<true> : vector<4x7x3x2xi1>
// CHECK-DAG: %[[PASSTHRU:.*]] = arith.constant dense<0.000000e+00> : vector<4x7x3x2xf32>
// CHECK:    %[[V0:.*]] = vector.transfer_read %[[ARG1]][%[[C0]], %[[C0]]], %[[C0_i32]] {in_bounds = [true, true]} : tensor<4x3xi32>, vector<4x3xi32>
// CHECK:    %[[V1:.*]] = vector.transfer_read %[[ARG2]][%[[C0]], %[[C0]]], %[[C0_i32]] {in_bounds = [true, true]} : tensor<4x3xi32>, vector<4x3xi32>
// CHECK:    %[[CAST:.*]] = arith.index_cast %[[V0]] : vector<4x3xi32> to vector<4x3xindex>
// CHECK:    %[[B1:.*]] = vector.broadcast %[[CAST]] : vector<4x3xindex> to vector<7x2x4x3xindex>
// CHECK:    %[[CAST_1:.*]] = arith.index_cast %[[V1]] : vector<4x3xi32> to vector<4x3xindex>
// CHECK:    %[[B2:.*]] = vector.broadcast %[[CAST_1]] : vector<4x3xindex> to vector<7x2x4x3xindex>
// CHECK:    %[[MULI:.*]] = arith.muli %[[B1]], %[[CST]] : vector<7x2x4x3xindex>
// CHECK:    %[[ADDI:.*]] = arith.addi %[[B2]], %[[MULI]] : vector<7x2x4x3xindex>
// CHECK:    %[[T:.*]] = vector.transpose %[[ADDI]], [2, 0, 3, 1] : vector<7x2x4x3xindex> to vector<4x7x3x2xindex>
// CHECK:    %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]], %[[C0]]] [%[[T]]], %[[CST_1]], %[[PASSTHRU]] : tensor<3x3xf32>, vector<4x7x3x2xindex>, vector<4x7x3x2xi1>, vector<4x7x3x2xf32> into vector<4x7x3x2xf32>
// CHECK:    vector.transfer_write %[[GATHER]], %[[ARG4]][%[[C0]], %[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true, true]} : vector<4x7x3x2xf32>, tensor<4x7x3x2xf32>

// -----

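// Gather load from a 1-D column vector: only the leading output dimension is
// non-unit, so the load cannot be contiguous. The `linalg.index 0` values
// are scaled by 128 * 768 (the stride of the outermost dimension of
// tensor<8x128x768xf32>) and offset by the loop-invariant `%arg1 + %arg1`.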
#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0 + d1 + d2)>
func.func @vectorize_nd_tensor_extract_load_1d_column_vector_using_gather_load(%arg0: tensor<8x128x768xf32>, %arg1 : index) -> tensor<8x1xf32> {
  %c0 = arith.constant 0 : index
  %0 = tensor.empty() : tensor<8x1xf32>
  %1 = linalg.generic {
    indexing_maps = [#map],
    iterator_types = ["parallel", "parallel"]
  } outs(%0 : tensor<8x1xf32>) {
  ^bb0(%arg5: f32):
    %2 = linalg.index 0 : index
    %3 = linalg.index 1 : index
    %4 = affine.apply #map1(%arg1, %3, %arg1)
    %extracted = tensor.extract %arg0[%2, %c0, %4] : tensor<8x128x768xf32>
    linalg.yield %extracted : f32
  } -> tensor<8x1xf32>
  return %1 : tensor<8x1xf32>
}

// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_load_1d_column_vector_using_gather_load
// CHECK-SAME: %[[ARG0:.*]]: tensor<8x128x768xf32>
// CHECK-SAME: %[[ARG1:.*]]: index
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[CST:.*]] = arith.constant dense<768> : vector<1x8xindex>
// CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<128> : vector<1x8xindex>
// CHECK-DAG: %[[PASSTHRU:.*]] = arith.constant dense<0.000000e+00> : vector<8x1xf32>
// CHECK-DAG: %[[CST_2:.*]] = arith.constant dense<true> : vector<8x1xi1>
// CHECK-DAG: %[[CST_3:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex>
// CHECK: %[[EMPTY:.*]] = tensor.empty() : tensor<8x1xf32>
// CHECK: %[[B1:.*]] = vector.broadcast %[[CST_3]] : vector<8xindex> to vector<1x8xindex>
// CHECK: %[[ADDI_ARG1:.*]] = arith.addi %[[ARG1]], %[[ARG1]] : index
// CHECK: %[[B2:.*]] = vector.broadcast %[[ADDI_ARG1]] : index to vector<1xindex>
// CHECK: %[[MULI_1:.*]] = arith.muli %[[B1]], %[[CST_0]] : vector<1x8xindex>
// CHECK: %[[MULI_2:.*]] = arith.muli %[[MULI_1]], %[[CST]] : vector<1x8xindex>
// CHECK: %[[T:.*]] = vector.transpose %[[MULI_2]], [1, 0] : vector<1x8xindex> to vector<8x1xindex>
// CHECK: %[[B3:.*]] = vector.broadcast %[[B2]] : vector<1xindex> to vector<8x1xindex>
// CHECK: %[[ADDI:.*]] = arith.addi %[[B3]], %[[T]] : vector<8x1xindex>
// CHECK: %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]], %[[C0]], %[[C0]]] [%[[ADDI]]], %[[CST_2]], %[[PASSTHRU]] : tensor<8x128x768xf32>, vector<8x1xindex>, vector<8x1xi1>, vector<8x1xf32> into vector<8x1xf32>
// CHECK: vector.transfer_write %[[GATHER]], %[[EMPTY]][%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<8x1xf32>, tensor<8x1xf32>

// -----

// Reading a 1-D column vector (hence a candidate for a contiguous load).
// However, the index %1 varies along the non-trailing dimension of the
// source, so this is lowered as a gather load.
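// Reading column 0 of tensor<8x128xf32> means consecutive output elements
// are 128 elements apart in memory, hence the `dense<128>` stride below.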

#map = affine_map<(d0, d1) -> (d0, d1)>
func.func @index_from_output_column_vector_gather_load(%src: tensor<8x128xf32>) -> tensor<8x1xf32> {
  %c0 = arith.constant 0 : index
  %0 = tensor.empty() : tensor<8x1xf32>
  %res = linalg.generic {
    indexing_maps = [#map],
    iterator_types = ["parallel", "parallel"]
  } outs(%0 : tensor<8x1xf32>) {
  ^bb0(%arg1: f32):
    %1 = linalg.index 0 : index
    %extracted = tensor.extract %src[%1, %c0] : tensor<8x128xf32>
    linalg.yield %extracted : f32
  } -> tensor<8x1xf32>
  return %res : tensor<8x1xf32>
}

// CHECK-LABEL:   func.func @index_from_output_column_vector_gather_load(
// CHECK-SAME:      %[[SRC:.*]]: tensor<8x128xf32>) -> tensor<8x1xf32> {
// CHECK:           %[[C128:.*]] = arith.constant dense<128> : vector<1x8xindex>
// CHECK:           %[[C0:.*]] = arith.constant 0 : index
// CHECK:           %[[PASS_THRU:.*]] = arith.constant dense<0.000000e+00> : vector<8x1xf32>
// CHECK:           %[[MASK:.*]] = arith.constant dense<true> : vector<8x1xi1>
// CHECK:           %[[IDX_VEC:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex>
// CHECK:           %[[OUT:.*]] = tensor.empty() : tensor<8x1xf32>
// CHECK:           %[[B:.*]] = vector.broadcast %[[IDX_VEC]] : vector<8xindex> to vector<1x8xindex>
// CHECK:           %[[MUL:.*]] = arith.muli %[[B]], %[[C128]] : vector<1x8xindex>
// CHECK:           %[[TR:.*]] = vector.transpose %[[MUL]], [1, 0] : vector<1x8xindex> to vector<8x1xindex>
// CHECK:           %[[GATHER:.*]] = vector.gather %[[SRC]]{{\[}}%[[C0]], %[[C0]]] {{\[}}%[[TR]]], %[[MASK]], %[[PASS_THRU]] : tensor<8x128xf32>, vector<8x1xindex>, vector<8x1xi1>, vector<8x1xf32> into vector<8x1xf32>
// CHECK:           %[[RES:.*]] = vector.transfer_write %[[GATHER]], %[[OUT]]{{\[}}%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<8x1xf32>, tensor<8x1xf32>
// CHECK:           return %[[RES]] : tensor<8x1xf32>

// -----

// Same as above, but the access indices have been swapped and hence this _is_
// a contiguous load. Currently not supported and lowered as vector.gather
// instead.
// TODO: Make sure that this is lowered as a contiguous load.

#map = affine_map<(d0, d1) -> (d0, d1)>
func.func @index_from_output_column_vector_contiguous_load(%src: tensor<8x128xf32>) -> tensor<8x1xf32> {
  %c0 = arith.constant 0 : index
  %0 = tensor.empty() : tensor<8x1xf32>
  %res = linalg.generic {
    indexing_maps = [#map],
    iterator_types = ["parallel", "parallel"]
  } outs(%0 : tensor<8x1xf32>) {
  ^bb0(%arg1: f32):
    %1 = linalg.index 0 : index
    %extracted = tensor.extract %src[%c0, %1] : tensor<8x128xf32>
    linalg.yield %extracted : f32
  } -> tensor<8x1xf32>
  return %res : tensor<8x1xf32>
}

// CHECK-LABEL:   func.func @index_from_output_column_vector_contiguous_load(
// CHECK-SAME:      %[[SRC:.*]]: tensor<8x128xf32>) -> tensor<8x1xf32> {
// CHECK:           %[[C0:.*]] = arith.constant 0 : index
// CHECK:           %[[PASS_THRU:.*]] = arith.constant dense<0.000000e+00> : vector<8x1xf32>
// CHECK:           %[[MASK:.*]] = arith.constant dense<true> : vector<8x1xi1>
// CHECK:           %[[IDX_VEC:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex>
// CHECK:           %[[OUT:.*]] = tensor.empty() : tensor<8x1xf32>
// CHECK:           %[[B:.*]] = vector.broadcast %[[IDX_VEC]] : vector<8xindex> to vector<1x8xindex>
// CHECK:           %[[TR:.*]] = vector.transpose %[[B]], [1, 0] : vector<1x8xindex> to vector<8x1xindex>
// CHECK:           %[[GATHER:.*]] = vector.gather %[[SRC]]{{\[}}%[[C0]], %[[C0]]] {{\[}}%[[TR]]], %[[MASK]], %[[PASS_THRU]] : tensor<8x128xf32>, vector<8x1xindex>, vector<8x1xi1>, vector<8x1xf32> into vector<8x1xf32>
// CHECK:           %[[RES:.*]] = vector.transfer_write %[[GATHER]], %[[OUT]]{{\[}}%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<8x1xf32>, tensor<8x1xf32>
// CHECK:           return %[[RES]] : tensor<8x1xf32>

// -----

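// A single generic mixing both patterns: the index vector is read
// contiguously with `vector.transfer_read`, clamped to [0, 5] with
// vectorized `maxsi`/`minsi`, and then used as gather offsets into the
// 6-element source.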
#map = affine_map<(d0) -> (d0)>
func.func @vectorize_nd_tensor_extract_contiguous_and_gather(%arg0: tensor<6xf32>, %arg1: tensor<5xi32>) -> tensor<5xf32> {
  %c5 = arith.constant 5 : index
  %c0 = arith.constant 0 : index
  %0 = tensor.empty() : tensor<5xf32>
  %1 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel"]} outs(%0 : tensor<5xf32>) {
  ^bb0(%out: f32):
    %2 = linalg.index 0 : index
    %extracted = tensor.extract %arg1[%2] : tensor<5xi32>
    %3 = arith.index_cast %extracted : i32 to index
    %4 = arith.maxsi %3, %c0 : index
    %5 = arith.minsi %4, %c5 : index
    %extracted_0 = tensor.extract %arg0[%5] : tensor<6xf32>
    linalg.yield %extracted_0 : f32
  } -> tensor<5xf32>
  return %1 : tensor<5xf32>
}

// CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_contiguous_and_gather(
// CHECK-SAME:                    %[[VAL_0:.*]]: tensor<6xf32>
// CHECK-SAME:                    %[[VAL_1:.*]]: tensor<5xi32>
// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 0 : index
// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 0 : i32
// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant dense<0> : vector<5xindex>
// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant dense<5> : vector<5xindex>
// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant dense<true> : vector<5xi1>
// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant dense<0.000000e+00> : vector<5xf32>
// CHECK:           %[[VAL_8:.*]] = tensor.empty() : tensor<5xf32>
// CHECK:           %[[VAL_9:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%{{.*}}], %[[VAL_3]] {in_bounds = [true]} : tensor<5xi32>, vector<5xi32>
// CHECK:           %[[VAL_10:.*]] = arith.index_cast %[[VAL_9]] : vector<5xi32> to vector<5xindex>
// CHECK:           %[[VAL_11:.*]] = arith.maxsi %[[VAL_10]], %[[VAL_4]] : vector<5xindex>
// CHECK:           %[[VAL_12:.*]] = arith.minsi %[[VAL_11]], %[[VAL_5]] : vector<5xindex>
// CHECK:           %[[VAL_13:.*]] = vector.gather %[[VAL_0]]{{\[}}%[[VAL_2]]] {{\[}}%[[VAL_12]]], %[[VAL_6]], %[[VAL_7]] : tensor<6xf32>, vector<5xindex>, vector<5xi1>, vector<5xf32> into vector<5xf32>
// CHECK:           %[[VAL_14:.*]] = vector.transfer_write %[[VAL_13]], %[[VAL_8]]{{\[}}%[[VAL_2]]] {in_bounds = [true]} : vector<5xf32>, tensor<5xf32>
// CHECK:           return %[[VAL_14]] : tensor<5xf32>

// -----

// The vectorizer converts `affine.apply` so that the subsequent ops can be
// vectorized based on the converted ops. Gather load.
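// Unlike the contiguous case above, the `affine.apply` result here indexes
// the non-trailing dimension of the source, so the vectorized indices feed a
// `vector.gather`.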
func.func @vectorize_nd_tensor_extract_with_affine_apply_gather(%6: tensor<80x16xf32>, %arg0: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
  %c16 = arith.constant 16 : index
  %1 = linalg.generic {
    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
    iterator_types = ["parallel", "parallel"]
  } outs(%extracted_slice : tensor<1x4xf32>) {
  ^bb0(%out: f32):
    %2 = linalg.index 1 : index
    %3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%2, %arg0)
    %extracted = tensor.extract %6[%3, %c16] : tensor<80x16xf32>
    linalg.yield %extracted : f32
  } -> tensor<1x4xf32>
  return %1 : tensor<1x4xf32>
}

// CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_with_affine_apply_gather(
// CHECK-SAME:                                                                    %[[VAL_0:.*]]: tensor<80x16xf32>,
// CHECK-SAME:                                                                    %[[VAL_1:.*]]: index,
// CHECK-SAME:                                                                    %[[VAL_2:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant dense<true> : vector<1x4xi1>
// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<1x4xf32>
// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 0 : index
// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant dense<16> : vector<1x4xindex>
// CHECK:           %[[VAL_8:.*]] = vector.broadcast %[[VAL_1]] : index to vector<4xindex>
// CHECK:           %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_3]] : vector<4xindex>
// CHECK:           %[[VAL_10:.*]] = vector.broadcast %[[VAL_9]] : vector<4xindex> to vector<1x4xindex>
// CHECK:           %[[VAL_11:.*]] = arith.muli %[[VAL_10]], %[[VAL_7]] : vector<1x4xindex>
// CHECK:           %[[VAL_12:.*]] = arith.addi %[[VAL_11]], %[[VAL_7]] : vector<1x4xindex>
// CHECK:           %[[VAL_13:.*]] = vector.gather %[[VAL_0]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {{\[}}%[[VAL_12]]], %[[VAL_4]], %[[VAL_5]] : tensor<80x16xf32>, vector<1x4xindex>, vector<1x4xi1>, vector<1x4xf32> into vector<1x4xf32>
// CHECK:           %[[VAL_14:.*]] = vector.transfer_write %[[VAL_13]], %[[VAL_2]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
// CHECK:           return %[[VAL_14]] : tensor<1x4xf32>
// CHECK:         }

// -----

// Make sure that non-linear arithmetic operations (e.g. arith.maxsi) are
// allowed when calculating indices for load operations. Gather load.
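// With %2 in [0, 4), `arith.maxsi %2, %c79` folds to a splat of 79, so the
// linearized offset becomes 79 * 16 + [0, 1, 2, 3], i.e. the `dense<1264>`
// constant in the CHECK lines below.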
func.func @vectorize_nd_tensor_extract_with_maxsi_gather(%arg0: tensor<80x16xf32>, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
  %c79 = arith.constant 79 : index
  %1 = linalg.generic {
    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
    iterator_types = ["parallel", "parallel"]
  } outs(%extracted_slice : tensor<1x4xf32>) {
  ^bb0(%out: f32):
    %2 = linalg.index 1 : index
    %3 = arith.maxsi %2, %c79 : index
    %extracted = tensor.extract %arg0[%3, %2] : tensor<80x16xf32>
    linalg.yield %extracted : f32
  } -> tensor<1x4xf32>
  return %1 : tensor<1x4xf32>
}

// CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_with_maxsi_gather(
// CHECK-SAME:                                                             %[[VAL_0:.*]]: tensor<80x16xf32>,
// CHECK-SAME:                                                             %[[VAL_1:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant dense<1264> : vector<1x4xindex>
// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant dense<true> : vector<1x4xi1>
// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<1x4xf32>
// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 0 : index
// CHECK:           %[[VAL_7:.*]] = vector.broadcast %[[VAL_2]] : vector<4xindex> to vector<1x4xindex>
// CHECK:           %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_3]] : vector<1x4xindex>
// CHECK:           %[[VAL_9:.*]] = vector.gather %[[VAL_0]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {{\[}}%[[VAL_8]]], %[[VAL_4]], %[[VAL_5]] : tensor<80x16xf32>, vector<1x4xindex>, vector<1x4xi1>, vector<1x4xf32> into vector<1x4xf32>
// CHECK:           %[[VAL_10:.*]] = vector.transfer_write %[[VAL_9]], %[[VAL_1]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
// CHECK:           return %[[VAL_10]] : tensor<1x4xf32>
// CHECK:         }

// -----

// The vectorizer assumes a gather load whenever a block argument is used to
// calculate an index.
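// Here %in is the value loaded from %arg1, so the gather offsets are
// `%in * 6 + [0, 1, 2, 3, 4]` (row-major addressing of tensor<5x6xf32>).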
#map = affine_map<(d0) -> (d0)>
func.func @vectorize_nd_tensor_extract_block_arg(%arg0: tensor<5x6xf32>, %arg1: tensor<5xindex>) -> tensor<5xf32> {
  %0 = tensor.empty() : tensor<5xf32>
  %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg1: tensor<5xindex>) outs(%0 : tensor<5xf32>) {
  ^bb0(%in: index, %out: f32):
    %2 = linalg.index 0 : index
    %extracted_0 = tensor.extract %arg0[%in, %2] : tensor<5x6xf32>
    linalg.yield %extracted_0 : f32
  } -> tensor<5xf32>
  return %1 : tensor<5xf32>
}

// CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_block_arg(
// CHECK-SAME:                                                     %[[VAL_0:.*]]: tensor<5x6xf32>,
// CHECK-SAME:                                                     %[[VAL_1:.*]]: tensor<5xindex>) -> tensor<5xf32> {
// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 0 : index
// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3, 4]> : vector<5xindex>
// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant dense<true> : vector<5xi1>
// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<5xf32>
// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant dense<6> : vector<5xindex>
// CHECK:           %[[VAL_7:.*]] = tensor.empty() : tensor<5xf32>
// CHECK:           %[[VAL_8:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_2]]], %[[VAL_2]] {in_bounds = [true]} : tensor<5xindex>, vector<5xindex>
// CHECK:           %[[VAL_9:.*]] = arith.muli %[[VAL_8]], %[[VAL_6]] : vector<5xindex>
// CHECK:           %[[VAL_10:.*]] = arith.addi %[[VAL_9]], %[[VAL_3]] : vector<5xindex>
// CHECK:           %[[VAL_11:.*]] = vector.gather %[[VAL_0]]{{\[}}%[[VAL_2]], %[[VAL_2]]] {{\[}}%[[VAL_10]]], %[[VAL_4]], %[[VAL_5]] : tensor<5x6xf32>, vector<5xindex>, vector<5xi1>, vector<5xf32> into vector<5xf32>
// CHECK:           %[[VAL_12:.*]] = vector.transfer_write %[[VAL_11]], %[[VAL_7]]{{\[}}%[[VAL_2]]] {in_bounds = [true]} : vector<5xf32>, tensor<5xf32>
// CHECK:           return %[[VAL_12]] : tensor<5xf32>
// CHECK:         }

// -----

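// The `dense<[2, 1, 0]>` index vector below reverses the innermost dimension
// (%5 = 2 - %4), which makes the access non-contiguous and forces a gather.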
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0 + d1 + d2)>
func.func @vectorize_reverse_like_tensor_extract(%arg0: tensor<1x2x3xf32>, %arg1: tensor<1x1x3xf32>, %arg2: index) -> tensor<1x1x3xf32> {
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %c2 = arith.constant 2 : index
  %0 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel", "parallel"]} outs(%arg1 : tensor<1x1x3xf32>) {
  ^bb0(%out: f32):
    %1 = linalg.index 1 : index
    %2 = linalg.index 0 : index
    %3 = affine.apply #map1(%1, %2, %arg2)
    %4 = linalg.index 2 : index
    %5 = arith.subi %c2, %4 : index
    %extracted = tensor.extract %arg0[%c0, %3, %5] : tensor<1x2x3xf32>
    linalg.yield %extracted : f32
  } -> tensor<1x1x3xf32>
  return %0 : tensor<1x1x3xf32>
}
// CHECK-LABEL: func.func @vectorize_reverse_like_tensor_extract
// CHECK-SAME:    %[[ARG0:[0-9a-zA-Z]*]]
// CHECK-SAME:    %[[ARG1:[0-9a-zA-Z]*]]
// CHECK-SAME:    %[[ARG2:[0-9a-zA-Z]*]]
// CHECK-DAG:    %[[CST:.+]] = arith.constant dense<3> : vector<1x1x3xindex>
// CHECK-DAG:    %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG:    %[[MASK:.*]] = arith.constant dense<true> : vector<1x1x3xi1>
// CHECK-DAG:    %[[PASSTHRU:.*]] = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
// CHECK-DAG:    %[[INIT_IDX:.+]] = arith.constant dense<[2, 1, 0]> : vector<3xindex>
// CHECK:        %[[T0:.+]] = vector.broadcast %[[ARG2]] : index to vector<1x1x3xindex>
// CHECK:        %[[T1:.+]] = arith.muli %[[T0]], %[[CST]] : vector<1x1x3xindex>
// CHECK:        %[[T2:.+]] = vector.broadcast %[[INIT_IDX]]
// CHECK:        %[[T3:.+]] = arith.addi %[[T2]], %[[T1]]
// CHECK:        %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]], %[[C0]], %[[C0]]] [%[[T3]]], %[[MASK]], %[[PASSTHRU]]
// CHECK:        vector.transfer_write %[[GATHER]]

// -----

//===----------------------------------------------------------------------===//
// Scalar load + broadcast
//===----------------------------------------------------------------------===//

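// When every index of the `tensor.extract` is loop-invariant, the vectorizer
// emits a single scalar (0-D) `vector.transfer_read` and broadcasts the
// value to the full output vector shape.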
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
func.func @vectorize_nd_tensor_extract_scalar_broadcast(%src: tensor<3x3xf32>, %init: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index

  %res = linalg.generic {
    indexing_maps = [#map],
    iterator_types = ["parallel", "parallel", "parallel"]
  } outs(%init : tensor<1x1x3xf32>) {
  ^bb0(%arg4: f32):
    %1 = tensor.extract %src[%c1, %c2] : tensor<3x3xf32>
    linalg.yield %1 : f32
  } -> tensor<1x1x3xf32>

  return %res : tensor<1x1x3xf32>
}

// CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_scalar_broadcast(
// CHECK-SAME:      %[[SRC:.*]]: tensor<3x3xf32>,
// CHECK-SAME:      %[[INIT:.*]]: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG:       %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG:       %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]][%[[C1]], %[[C2]]], %[[PAD]] : tensor<3x3xf32>, vector<f32>
// CHECK:           %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1x1x3xf32>
// CHECK:           vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x3xf32>, tensor<1x1x3xf32>

// -----

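// The 0-D source case: both the read and the write are 0-D transfers, so no
// broadcast is needed when the output is also 0-D.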
#map = affine_map<() -> ()>
func.func @extract_scalar_from_0d_into_0d(%src: tensor<f32>, %init: tensor<f32>) -> tensor<f32> {
  %res = linalg.generic {
    indexing_maps = [#map],
    iterator_types = []
  } outs(%init : tensor<f32>) {
  ^bb0(%in: f32):
    %1 = tensor.extract %src[] : tensor<f32>
    linalg.yield %1 : f32
  } -> tensor<f32>

  return %res : tensor<f32>
}

// CHECK-LABEL:   func.func @extract_scalar_from_0d_into_0d(
// CHECK-SAME:      %[[SRC:.*]]: tensor<f32>,
// CHECK-SAME:      %[[INIT:.*]]: tensor<f32>) -> tensor<f32> {
// CHECK:           %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]][], %[[PAD]] : tensor<f32>, vector<f32>
// CHECK:           vector.transfer_write %[[READ]], %[[INIT]][] : vector<f32>, tensor<f32>

// -----

#map = affine_map<(n) -> (n)>
func.func @extract_scalar_from_0d_into_1d(%src: tensor<f32>, %init: tensor<1xf32>) -> tensor<1xf32> {
  %res = linalg.generic {
    indexing_maps = [#map],
    iterator_types = ["parallel"]
  } outs(%init : tensor<1xf32>) {
  ^bb0(%in: f32):
    %1 = tensor.extract %src[] : tensor<f32>
    linalg.yield %1 : f32
  } -> tensor<1xf32>

  return %res : tensor<1xf32>
}
// CHECK-LABEL:   func.func @extract_scalar_from_0d_into_1d(
// CHECK-SAME:      %[[SRC:.*]]: tensor<f32>,
// CHECK-SAME:      %[[INIT:.*]]: tensor<1xf32>) -> tensor<1xf32> {
// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG:       %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]][], %[[PAD]] : tensor<f32>, vector<f32>
// CHECK:           %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<1xf32>
// CHECK:           vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]]] {in_bounds = [true]} : vector<1xf32>, tensor<1xf32>

// -----

#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
func.func @vectorize_0d_tensor_extract(%src: tensor<f32>, %init: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
  %res = linalg.generic {
    indexing_maps = [#map1],
    iterator_types = ["parallel", "parallel", "parallel"]
  } outs(%init : tensor<1x1x3xf32>) {
  ^bb0(%arg4: f32):
    %1 = tensor.extract %src[] : tensor<f32>
    linalg.yield %1 : f32
  } -> tensor<1x1x3xf32>
  return %res : tensor<1x1x3xf32>
}

// CHECK-LABEL:   func.func @vectorize_0d_tensor_extract(
// CHECK-SAME:     %[[SRC:.*]]: tensor<f32>
// CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]][], %{{.+}} : tensor<f32>
// CHECK:           vector.broadcast %[[READ]] : vector<f32> to vector<1x1x3xf32>

// -----

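// The source is a compile-time constant column tensor. Because the
// corresponding output dimension is unit, the `linalg.index 0` values fold
// to the `dense<0>` vector; element 0 of it addresses a scalar read that is
// then broadcast.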
func.func @scalar_read_with_broadcast_from_column_tensor(%init: tensor<1x1x4xi32>) -> tensor<1x1x4xi32> {
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %src = arith.constant dense<[[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14]]> : tensor<15x1xi32>

  %res = linalg.generic {
    indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
    iterator_types = ["parallel", "parallel", "parallel"]
  } outs(%init : tensor<1x1x4xi32>) {
  ^bb0(%out: i32):
    %idx = linalg.index 0 : index
    %extracted = tensor.extract %src[%idx, %c0] : tensor<15x1xi32>
    linalg.yield %extracted : i32
  } -> tensor<1x1x4xi32>

  return %res : tensor<1x1x4xi32>
}

// CHECK-LABEL:   func.func @scalar_read_with_broadcast_from_column_tensor
// CHECK-SAME:      %[[INIT:.*]]: tensor<1x1x4xi32>) -> tensor<1x1x4xi32> {
// CHECK-DAG:       %[[PAD:.*]] = arith.constant 0 : i32
// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG:       %[[SRC:.*]] = arith.constant dense<{{\[\[}}0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14]]> : tensor<15x1xi32>
// CHECK-DAG:       %[[IDX_VEC:.*]] = arith.constant dense<0> : vector<1xindex>
// CHECK:           %[[IDX_ELT:.*]] = vector.extract %[[IDX_VEC]][0] : index from vector<1xindex>
// CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]]{{\[}}%[[IDX_ELT]], %[[C0]]], %[[PAD]] : tensor<15x1xi32>, vector<i32>
// CHECK:           %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<i32> to vector<1x1x4xi32>
// CHECK:           %[[RES:.*]] = vector.transfer_write %[[READ_BCAST]], %[[INIT]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x1x4xi32>, tensor<1x1x4xi32>

// -----

// TODO: Currently this fails to vectorize when the indices are non-constant.

#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
func.func @vectorize_nd_tensor_extract_transfer_read_basic_column(
    %src: tensor<3x3x3xf32>,
    %init: tensor<3x1x1xf32>) -> tensor<3x1x1xf32> {

  %c0 = arith.constant 0 : index

  %res = linalg.generic {
    indexing_maps = [#map],
    iterator_types = ["parallel", "parallel", "parallel"]
  } outs(%init : tensor<3x1x1xf32>) {
  ^bb0(%out: f32):
    %1 = tensor.extract %src[%c0, %c0, %c0] : tensor<3x3x3xf32>
    linalg.yield %1 : f32
  } -> tensor<3x1x1xf32>

  return %res : tensor<3x1x1xf32>
}

// CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_transfer_read_basic_column(
// CHECK-SAME:      %[[SRC:.*]]: tensor<3x3x3xf32>,
// CHECK-SAME:      %[[INIT:.*]]: tensor<3x1x1xf32>)
// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG:       %[[CST_0:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:           %[[READ:.*]] = vector.transfer_read %[[SRC]][%[[C0]], %[[C0]], %[[C0]]], %[[CST_0]] : tensor<3x3x3xf32>, vector<f32>
// CHECK:           %[[READ_BCAST:.*]] = vector.broadcast %[[READ]] : vector<f32> to vector<3x1x1xf32>
// CHECK:           vector.transfer_write %[[READ_BCAST]], %[[INIT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<3x1x1xf32>, tensor<3x1x1xf32>