// RUN: mlir-opt %s -transform-interpreter -split-input-file | FileCheck %s

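// Masked vectorization of a dynamically shaped 1-D elementwise addition. A
// single mask, sized from the runtime dimension, guards all the reads and the
// write.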
func.func @vectorize_dynamic_identity(%arg0: tensor<?xf32>,
                                      %arg1: tensor<?xf32>,
                                      %arg2: tensor<?xf32>) -> tensor<?xf32> {
  %0 = linalg.generic { indexing_maps = [affine_map<(d0) -> (d0)>,
                                         affine_map<(d0) -> (d0)>,
                                         affine_map<(d0) -> (d0)>],
                   iterator_types = ["parallel"] }
    ins(%arg0, %arg1 : tensor<?xf32>, tensor<?xf32>)
    outs(%arg2 : tensor<?xf32>) {
    ^bb(%in0: f32, %in1: f32, %out: f32) :
      %0 = arith.addf %in0, %in1 : f32
      linalg.yield %0 : f32
    } -> tensor<?xf32>
  return %0 : tensor<?xf32>
}

// CHECK-LABEL:   @vectorize_dynamic_identity
// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
// CHECK:           %[[VAL_4:.*]] = tensor.dim %{{.*}}, %[[VAL_3]] : tensor<?xf32>
// CHECK:           %[[VAL_7:.*]] = vector.create_mask %[[VAL_4]] : vector<4xi1>
// CHECK:           %[[VAL_8:.*]] = vector.mask %[[VAL_7]] { vector.transfer_read %{{.*}} {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK:           %[[VAL_10:.*]] = vector.mask %[[VAL_7]] { vector.transfer_read %{{.*}} {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK:           %[[VAL_12:.*]] = vector.mask %[[VAL_7]] { vector.transfer_read %{{.*}} {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK:           %[[VAL_13:.*]] = arith.addf %[[VAL_8]], %[[VAL_10]] : vector<4xf32>
// CHECK:           %[[VAL_14:.*]] = vector.mask %[[VAL_7]] { vector.transfer_write %{{.*}} {in_bounds = [true]} : vector<4xf32>, tensor<?xf32> } : vector<4xi1> -> tensor<?xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [4] : !transform.any_op
    transform.yield
  }
}

// -----

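// Same payload as above, but the vector size is passed to the transform as a
// handle to the matched `arith.constant 4 : index` op rather than as a
// literal.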
func.func @vectorize_dynamic_identity_with_constant(%arg0: tensor<?xf32>,
                                                    %arg1: tensor<?xf32>,
                                                    %arg2: tensor<?xf32>) -> tensor<?xf32> {
  %c4 = arith.constant 4 : index
  %0 = linalg.generic { indexing_maps = [affine_map<(d0) -> (d0)>,
                                         affine_map<(d0) -> (d0)>,
                                         affine_map<(d0) -> (d0)>],
                   iterator_types = ["parallel"] }
    ins(%arg0, %arg1 : tensor<?xf32>, tensor<?xf32>)
    outs(%arg2 : tensor<?xf32>) {
    ^bb(%in0: f32, %in1: f32, %out: f32) :
      %0 = arith.addf %in0, %in1 : f32
      linalg.yield %0 : f32
    } -> tensor<?xf32>
  return %0 : tensor<?xf32>
}

// CHECK-LABEL:   @vectorize_dynamic_identity_with_constant
// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
// CHECK:           %[[VAL_4:.*]] = tensor.dim %{{.*}}, %[[VAL_3]] : tensor<?xf32>
// CHECK:           %[[VAL_7:.*]] = vector.create_mask %[[VAL_4]] : vector<4xi1>
// CHECK:           %[[VAL_8:.*]] = vector.mask %[[VAL_7]] { vector.transfer_read %{{.*}} {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK:           %[[VAL_10:.*]] = vector.mask %[[VAL_7]] { vector.transfer_read %{{.*}} {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK:           %[[VAL_12:.*]] = vector.mask %[[VAL_7]] { vector.transfer_read %{{.*}} {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK:           %[[VAL_13:.*]] = arith.addf %[[VAL_8]], %[[VAL_10]] : vector<4xf32>
// CHECK:           %[[VAL_14:.*]] = vector.mask %[[VAL_7]] { vector.transfer_write %{{.*}} {in_bounds = [true]} : vector<4xf32>, tensor<?xf32> } : vector<4xi1> -> tensor<?xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %size = transform.structured.match ops{["arith.constant"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [%size] : !transform.any_op, !transform.any_op
    transform.yield
  }
}

// -----

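// Same payload again; here the vector size is supplied as a
// `!transform.param<i64>` produced by `transform.param.constant`.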
func.func @vectorize_dynamic_identity_with_param(%arg0: tensor<?xf32>,
                                                 %arg1: tensor<?xf32>,
                                                 %arg2: tensor<?xf32>) -> tensor<?xf32> {
  %0 = linalg.generic { indexing_maps = [affine_map<(d0) -> (d0)>,
                                         affine_map<(d0) -> (d0)>,
                                         affine_map<(d0) -> (d0)>],
                   iterator_types = ["parallel"] }
    ins(%arg0, %arg1 : tensor<?xf32>, tensor<?xf32>)
    outs(%arg2 : tensor<?xf32>) {
    ^bb(%in0: f32, %in1: f32, %out: f32) :
      %0 = arith.addf %in0, %in1 : f32
      linalg.yield %0 : f32
    } -> tensor<?xf32>
  return %0 : tensor<?xf32>
}

// CHECK-LABEL:   @vectorize_dynamic_identity_with_param
// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
// CHECK:           %[[VAL_4:.*]] = tensor.dim %{{.*}}, %[[VAL_3]] : tensor<?xf32>
// CHECK:           %[[VAL_7:.*]] = vector.create_mask %[[VAL_4]] : vector<4xi1>
// CHECK:           %[[VAL_8:.*]] = vector.mask %[[VAL_7]] { vector.transfer_read %{{.*}} {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK:           %[[VAL_10:.*]] = vector.mask %[[VAL_7]] { vector.transfer_read %{{.*}} {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK:           %[[VAL_12:.*]] = vector.mask %[[VAL_7]] { vector.transfer_read %{{.*}} {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK:           %[[VAL_13:.*]] = arith.addf %[[VAL_8]], %[[VAL_10]] : vector<4xf32>
// CHECK:           %[[VAL_14:.*]] = vector.mask %[[VAL_7]] { vector.transfer_write %{{.*}} {in_bounds = [true]} : vector<4xf32>, tensor<?xf32> } : vector<4xi1> -> tensor<?xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %vector_size = transform.param.constant 4 : i64 -> !transform.param<i64>
    transform.structured.vectorize %0 vector_sizes [%vector_size] : !transform.any_op, !transform.param<i64>
    transform.yield
  }
}

// -----

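// The first input is indexed with `(d0) -> (0)`, i.e. a broadcast of element
// 0, so its transfer_read needs no mask; the remaining accesses are masked.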
func.func @vectorize_dynamic_1d_broadcast(%arg0: tensor<?xf32>,
                                          %arg1: tensor<?xf32>,
                                          %arg2: tensor<?xf32>) -> tensor<?xf32> {
  %0 = linalg.generic { indexing_maps = [affine_map<(d0) -> (0)>,
                                         affine_map<(d0) -> (d0)>,
                                         affine_map<(d0) -> (d0)>],
                        iterator_types = ["parallel"] }
    ins(%arg0, %arg1 : tensor<?xf32>, tensor<?xf32>)
    outs(%arg2 : tensor<?xf32>) {
    ^bb(%in0: f32, %in1: f32, %out: f32) :
      %0 = arith.addf %in0, %in1 : f32
      linalg.yield %0 : f32
    } -> tensor<?xf32>
  return %0 : tensor<?xf32>
}

// CHECK-LABEL:   @vectorize_dynamic_1d_broadcast
// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
// CHECK:           %[[VAL_4:.*]] = tensor.dim %{{.*}}, %[[VAL_3]] : tensor<?xf32>
// CHECK:           %[[VAL_7:.*]] = vector.transfer_read %{{.*}} {permutation_map = #{{.*}}} : tensor<?xf32>, vector<4xf32>
// CHECK:           %[[VAL_9:.*]] = vector.create_mask %[[VAL_4]] : vector<4xi1>
// CHECK:           %[[VAL_10:.*]] = vector.mask %[[VAL_9]] { vector.transfer_read %{{.*}} {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK:           %[[VAL_12:.*]] = vector.mask %[[VAL_9]] { vector.transfer_read %{{.*}} {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK:           %[[VAL_13:.*]] = arith.addf %[[VAL_7]], %[[VAL_10]] : vector<4xf32>
// CHECK:           %[[VAL_14:.*]] = vector.mask %{{.*}} { vector.transfer_write %[[VAL_13]], {{.*}} {in_bounds = [true]} : vector<4xf32>, tensor<?xf32> } : vector<4xi1> -> tensor<?xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [4] : !transform.any_op
    transform.yield
  }
}

// -----

#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d0, 0)>

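// Reduction along d1 whose result is written through the broadcast-like
// output map `(d0, d1) -> (d0, 0)`; note the 1-D mask and the permutation map
// on the output accesses.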
func.func @dynamic_generic_with_reduction_and_broadcast(%arg0: tensor<?x?xf32>, %init: tensor<?x?xf32>) -> (tensor<?x?xf32>) {
  %0 = linalg.generic { indexing_maps = [#map, #map1],
                        iterator_types = ["parallel", "reduction"]}
    ins(%arg0 : tensor<?x?xf32>)
    outs(%init : tensor<?x?xf32>) {
  ^bb0(%in: f32, %out: f32):
    %1 = arith.addf %in, %out : f32
    linalg.yield %1 : f32
  } -> tensor<?x?xf32>
  return %0 : tensor<?x?xf32>
}
// CHECK: #[[$MAP:.+]] = affine_map<(d0, d1) -> (d0)>

// CHECK-LABEL:   func.func @dynamic_generic_with_reduction_and_broadcast(
// CHECK-SAME:      %[[VAL_0:.*]]: tensor<?x?xf32>,
// CHECK-SAME:      %[[VAL_1:.*]]: tensor<?x?xf32>) -> tensor<?x?xf32> {
// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
// CHECK:           %[[VAL_3:.*]] = tensor.dim %[[VAL_0]], %[[VAL_2]] : tensor<?x?xf32>
// CHECK:           %[[VAL_4:.*]] = arith.constant 1 : index
// CHECK:           %[[VAL_5:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?xf32>
// CHECK:           %[[VAL_6:.*]] = arith.constant 0 : index
// CHECK:           %[[VAL_7:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:           %[[VAL_8:.*]] = vector.create_mask %[[VAL_3]], %[[VAL_5]] : vector<4x4xi1>
// CHECK:           %[[VAL_9:.*]] = vector.mask %[[VAL_8]] { vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_6]], %[[VAL_6]]], %[[VAL_7]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x4xf32> } : vector<4x4xi1> -> vector<4x4xf32>
// CHECK:           %[[VAL_10:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:           %[[VAL_11:.*]] = vector.create_mask %[[VAL_3]] : vector<4xi1>
// CHECK:           %[[VAL_12:.*]] = vector.mask %[[VAL_11]] { vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_6]], %[[VAL_6]]], %[[VAL_10]] {in_bounds = [true], permutation_map = #[[$MAP]]} : tensor<?x?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK:           %[[VAL_13:.*]] = vector.mask %[[VAL_8]] { vector.multi_reduction <add>, %[[VAL_9]], %[[VAL_12]] [1] : vector<4x4xf32> to vector<4xf32> } : vector<4x4xi1> -> vector<4xf32>
// CHECK:           %[[VAL_14:.*]] = arith.constant 0 : index
// CHECK:           %[[VAL_15:.*]] = vector.mask %[[VAL_11]] { vector.transfer_write %[[VAL_13]], %[[VAL_1]]{{\[}}%[[VAL_14]], %[[VAL_14]]] {in_bounds = [true], permutation_map = #[[$MAP]]} : vector<4xf32>, tensor<?x?xf32> } : vector<4xi1> -> tensor<?x?xf32>
// CHECK:           return %[[VAL_15]] : tensor<?x?xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [4, 4] : !transform.any_op
    transform.yield
  }
}

// -----

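// The first input is transposed via `(d0, d1) -> (d1, d0)`, so its read uses
// a permutation map and a mask with transposed shape (8x4 vs. the 4x8 result
// vector).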
func.func @vectorize_dynamic_2d_transpose(%arg0: tensor<?x?xf32>,
                                          %arg1: tensor<?x?xf32>,
                                          %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>,
                                         affine_map<(d0, d1) -> (d0, d1)>,
                                         affine_map<(d0, d1) -> (d0, d1)>],
                        iterator_types = ["parallel", "parallel"] }
    ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
    outs(%arg2 : tensor<?x?xf32>) {
    ^bb(%in0: f32, %in1: f32, %out: f32) :
      %0 = arith.addf %in0, %in1 : f32
      linalg.yield %0 : f32
    } -> tensor<?x?xf32>
    return %0 : tensor<?x?xf32>
}

// CHECK-LABEL:   @vectorize_dynamic_2d_transpose
// CHECK:           %[[VAL_3:.*]] = arith.constant 1 : index
// CHECK:           %[[VAL_4:.*]] = tensor.dim %{{.*}}, %[[VAL_3]] : tensor<?x?xf32>
// CHECK:           %[[VAL_5:.*]] = arith.constant 0 : index
// CHECK:           %[[VAL_6:.*]] = tensor.dim %{{.*}}, %[[VAL_5]] : tensor<?x?xf32>
// CHECK:           %[[VAL_9:.*]] = vector.create_mask %[[VAL_6]], %[[VAL_4]] : vector<8x4xi1>
// CHECK:           %[[VAL_10:.*]] = vector.mask %[[VAL_9]] { vector.transfer_read %{{.*}} {in_bounds = [true, true], permutation_map = #{{.*}}} : tensor<?x?xf32>, vector<4x8xf32> } : vector<8x4xi1> -> vector<4x8xf32>
// CHECK:           %[[VAL_12:.*]] = vector.create_mask %[[VAL_4]], %[[VAL_6]] : vector<4x8xi1>
// CHECK:           %[[VAL_13:.*]] = vector.mask %[[VAL_12]] { vector.transfer_read %{{.*}} {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x8xf32> } : vector<4x8xi1> -> vector<4x8xf32>
// CHECK:           %[[VAL_14:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:           %[[VAL_15:.*]] = vector.mask %[[VAL_12]] { vector.transfer_read %{{.*}} {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x8xf32> } : vector<4x8xi1> -> vector<4x8xf32>
// CHECK:           %[[VAL_16:.*]] = arith.addf %[[VAL_10]], %[[VAL_13]] : vector<4x8xf32>
// CHECK:           %[[VAL_17:.*]] = vector.mask %[[VAL_12]] { vector.transfer_write %[[VAL_16]], %{{.*}} {in_bounds = [true, true]} : vector<4x8xf32>, tensor<?x?xf32> } : vector<4x8xi1> -> tensor<?x?xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [4, 8] : !transform.any_op
    transform.yield
  }
}

// -----

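// The first input broadcasts row 0 via `(d0, d1) -> (0, d1)`; its read only
// needs a 1-D mask along d1.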
func.func @vectorize_dynamic_generic_2d_broadcast(%arg0: tensor<?x?xf32>,
                                                  %arg1: tensor<?x?xf32>,
                                                  %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (0, d1)>,
                                         affine_map<(d0, d1) -> (d0, d1)>,
                                         affine_map<(d0, d1) -> (d0, d1)>],
                        iterator_types = ["parallel", "parallel"] }
    ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
    outs(%arg2 : tensor<?x?xf32>) {
    ^bb(%in0: f32, %in1: f32, %out: f32) :
      %0 = arith.addf %in0, %in1 : f32
      linalg.yield %0 : f32
    } -> tensor<?x?xf32>
  return %0 : tensor<?x?xf32>
}

// CHECK-LABEL:   @vectorize_dynamic_generic_2d_broadcast
// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
// CHECK:           %[[VAL_4:.*]] = tensor.dim %{{.*}}, %[[VAL_3]] : tensor<?x?xf32>
// CHECK:           %[[VAL_5:.*]] = arith.constant 1 : index
// CHECK:           %[[VAL_6:.*]] = tensor.dim %{{.*}}, %[[VAL_5]] : tensor<?x?xf32>
// CHECK:           %[[VAL_9:.*]] = vector.create_mask %[[VAL_6]] : vector<8xi1>
// CHECK:           %[[VAL_10:.*]] = vector.mask %[[VAL_9]] { vector.transfer_read %{{.*}} {in_bounds = [true, true], permutation_map = #{{.*}}} : tensor<?x?xf32>, vector<4x8xf32> } : vector<8xi1> -> vector<4x8xf32>
// CHECK:           %[[VAL_12:.*]] = vector.create_mask %[[VAL_4]], %[[VAL_6]] : vector<4x8xi1>
// CHECK:           %[[VAL_13:.*]] = vector.mask %[[VAL_12]] { vector.transfer_read %{{.*}} {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x8xf32> } : vector<4x8xi1> -> vector<4x8xf32>
// CHECK:           %[[VAL_15:.*]] = vector.mask %[[VAL_12]] { vector.transfer_read %{{.*}} {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x8xf32> } : vector<4x8xi1> -> vector<4x8xf32>
// CHECK:           %[[VAL_16:.*]] = arith.addf %[[VAL_10]], %[[VAL_13]] : vector<4x8xf32>
// CHECK:           %[[VAL_18:.*]] = vector.mask %[[VAL_12]] { vector.transfer_write %{{.*}} {in_bounds = [true, true]} : vector<4x8xf32>, tensor<?x?xf32> } : vector<4x8xi1> -> tensor<?x?xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [4, 8] : !transform.any_op
    transform.yield
  }
}

// -----

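// Masked vectorization of a 2-D -> 1-D reduction; the accumulator is read and
// written under a 1-D mask while the reduction itself uses the full 2-D mask.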
func.func @vectorize_dynamic_reduction(%arg0: tensor<?x?xf32>,
                                       %arg1: tensor<?xf32>) -> tensor<?xf32> {
  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                                         affine_map<(d0, d1) -> (d0)>],
                        iterator_types = ["parallel", "reduction"] }
    ins(%arg0 : tensor<?x?xf32>)
    outs(%arg1 : tensor<?xf32>) {
    ^bb(%in: f32, %out: f32) :
      %0 = arith.addf %in, %out : f32
      linalg.yield %0 : f32
    } -> tensor<?xf32>
  return %0 : tensor<?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [4, 8] : !transform.any_op
    transform.yield
  }
}

// CHECK-LABEL:   @vectorize_dynamic_reduction(
// CHECK-SAME:                                 %[[VAL_0:.*]]: tensor<?x?xf32>,
// CHECK-SAME:                                 %[[VAL_1:.*]]: tensor<?xf32>) -> tensor<?xf32> {
// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
// CHECK:           %[[VAL_3:.*]] = tensor.dim %[[VAL_0]], %[[VAL_2]] : tensor<?x?xf32>
// CHECK:           %[[VAL_4:.*]] = arith.constant 1 : index
// CHECK:           %[[VAL_5:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?xf32>
// CHECK:           %[[VAL_8:.*]] = vector.create_mask %[[VAL_3]], %[[VAL_5]] : vector<4x8xi1>
// CHECK:           %[[VAL_9:.*]] = vector.mask %[[VAL_8]] { vector.transfer_read %[[VAL_0]]{{.*}} {in_bounds = [true, true]} : tensor<?x?xf32>, vector<4x8xf32> } : vector<4x8xi1> -> vector<4x8xf32>
// CHECK:           %[[VAL_11:.*]] = vector.create_mask %[[VAL_3]] : vector<4xi1>
// CHECK:           %[[VAL_12:.*]] = vector.mask %[[VAL_11]] { vector.transfer_read %[[VAL_1]]{{.*}} {in_bounds = [true]} : tensor<?xf32>, vector<4xf32> } : vector<4xi1> -> vector<4xf32>
// CHECK:           %[[VAL_13:.*]] = vector.mask %[[VAL_8]] { vector.multi_reduction <add>, %[[VAL_9]], %[[VAL_12]] [1] : vector<4x8xf32> to vector<4xf32> } : vector<4x8xi1> -> vector<4xf32>
// CHECK:           %[[VAL_15:.*]] = vector.mask %[[VAL_11]] { vector.transfer_write %[[VAL_13]], %[[VAL_1]]{{.*}} {in_bounds = [true]} : vector<4xf32>, tensor<?xf32> } : vector<4xi1> -> tensor<?xf32>
// CHECK:           return %[[VAL_15]] : tensor<?xf32>
// CHECK:         }

// -----

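// Reduction over d0 with a transposed output map `(d0, d1, d2) -> (d2, d1)`;
// the output accesses use a transposed (16x8) mask and a permutation map.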
func.func @vectorize_dynamic_transpose_reduction(%arg0: tensor<?x?x?xf32>,
                                                 %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
                                         affine_map<(d0, d1, d2) -> (d2, d1)>],
                        iterator_types = ["reduction", "parallel", "parallel"] }
    ins(%arg0 : tensor<?x?x?xf32>)
    outs(%arg1 : tensor<?x?xf32>) {
    ^bb(%in: f32, %out: f32) :
      %0 = arith.addf %in, %out : f32
      linalg.yield %0 : f32
    } -> tensor<?x?xf32>
  return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [4, 8, 16] : !transform.any_op
    transform.yield
  }
}

// CHECK-LABEL:   @vectorize_dynamic_transpose_reduction(
// CHECK-SAME:                                           %[[VAL_0:.*]]: tensor<?x?x?xf32>,
// CHECK-SAME:                                           %[[VAL_1:.*]]: tensor<?x?xf32>) -> tensor<?x?xf32> {
// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
// CHECK:           %[[VAL_3:.*]] = tensor.dim %[[VAL_0]], %[[VAL_2]] : tensor<?x?x?xf32>
// CHECK:           %[[VAL_4:.*]] = arith.constant 1 : index
// CHECK:           %[[VAL_5:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?x?xf32>
// CHECK:           %[[VAL_6:.*]] = arith.constant 2 : index
// CHECK:           %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_6]] : tensor<?x?x?xf32>
// CHECK:           %[[VAL_10:.*]] = vector.create_mask %[[VAL_3]], %[[VAL_5]], %[[VAL_7]] : vector<4x8x16xi1>
// CHECK:           %[[VAL_11:.*]] = vector.mask %[[VAL_10]] { vector.transfer_read %[[VAL_0]]{{.*}} {in_bounds = [true, true, true]} : tensor<?x?x?xf32>, vector<4x8x16xf32> } : vector<4x8x16xi1> -> vector<4x8x16xf32>
// CHECK:           %[[VAL_13:.*]] = vector.create_mask %[[VAL_7]], %[[VAL_5]] : vector<16x8xi1>
// CHECK:           %[[VAL_14:.*]] = vector.mask %[[VAL_13]] { vector.transfer_read %[[VAL_1]]{{.*}} {in_bounds = [true, true], permutation_map = #{{.*}}} : tensor<?x?xf32>, vector<8x16xf32> } : vector<16x8xi1> -> vector<8x16xf32>
// CHECK:           %[[VAL_15:.*]] = vector.mask %[[VAL_10]] { vector.multi_reduction <add>, %[[VAL_11]], %[[VAL_14]] [0] : vector<4x8x16xf32> to vector<8x16xf32> } : vector<4x8x16xi1> -> vector<8x16xf32>
// CHECK:           %[[VAL_17:.*]] = vector.mask %[[VAL_13]] { vector.transfer_write %[[VAL_15]], %{{.*}} {in_bounds = [true, true], permutation_map = #{{.*}}} : vector<8x16xf32>, tensor<?x?xf32> } : vector<16x8xi1> -> tensor<?x?xf32>

// -----

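// Same payload as above, mixing literal and `!transform.param<i64>` vector
// sizes in one `vector_sizes` list.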
func.func @vectorize_dynamic_transpose_reduction_with_params(%arg0: tensor<?x?x?xf32>,
                                                             %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
                                         affine_map<(d0, d1, d2) -> (d2, d1)>],
                        iterator_types = ["reduction", "parallel", "parallel"] }
    ins(%arg0 : tensor<?x?x?xf32>)
    outs(%arg1 : tensor<?x?xf32>) {
    ^bb(%in: f32, %out: f32) :
      %0 = arith.addf %in, %out : f32
      linalg.yield %0 : f32
    } -> tensor<?x?xf32>
  return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %vector_size_0 = transform.param.constant 4 : i64 -> !transform.param<i64>
    %vector_size_2 = transform.param.constant 16 : i64 -> !transform.param<i64>
    transform.structured.vectorize %0 vector_sizes
      [%vector_size_0, 8, %vector_size_2] : !transform.any_op, !transform.param<i64>, !transform.param<i64>
    transform.yield
  }
}

// CHECK-LABEL:   @vectorize_dynamic_transpose_reduction_with_params(
// CHECK-SAME:                                           %[[VAL_0:.*]]: tensor<?x?x?xf32>,
// CHECK-SAME:                                           %[[VAL_1:.*]]: tensor<?x?xf32>) -> tensor<?x?xf32> {
// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
// CHECK:           %[[VAL_3:.*]] = tensor.dim %[[VAL_0]], %[[VAL_2]] : tensor<?x?x?xf32>
// CHECK:           %[[VAL_4:.*]] = arith.constant 1 : index
// CHECK:           %[[VAL_5:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?x?xf32>
// CHECK:           %[[VAL_6:.*]] = arith.constant 2 : index
// CHECK:           %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_6]] : tensor<?x?x?xf32>
// CHECK:           %[[VAL_10:.*]] = vector.create_mask %[[VAL_3]], %[[VAL_5]], %[[VAL_7]] : vector<4x8x16xi1>
// CHECK:           %[[VAL_11:.*]] = vector.mask %[[VAL_10]] { vector.transfer_read %[[VAL_0]]{{.*}} {in_bounds = [true, true, true]} : tensor<?x?x?xf32>, vector<4x8x16xf32> } : vector<4x8x16xi1> -> vector<4x8x16xf32>
// CHECK:           %[[VAL_13:.*]] = vector.create_mask %[[VAL_7]], %[[VAL_5]] : vector<16x8xi1>
// CHECK:           %[[VAL_14:.*]] = vector.mask %[[VAL_13]] { vector.transfer_read %[[VAL_1]]{{.*}} {in_bounds = [true, true], permutation_map = #{{.*}}} : tensor<?x?xf32>, vector<8x16xf32> } : vector<16x8xi1> -> vector<8x16xf32>
// CHECK:           %[[VAL_15:.*]] = vector.mask %[[VAL_10]] { vector.multi_reduction <add>, %[[VAL_11]], %[[VAL_14]] [0] : vector<4x8x16xf32> to vector<8x16xf32> } : vector<4x8x16xi1> -> vector<8x16xf32>
// CHECK:           %[[VAL_17:.*]] = vector.mask %[[VAL_13]] { vector.transfer_write %[[VAL_15]], %{{.*}} {in_bounds = [true, true], permutation_map = #{{.*}}} : vector<8x16xf32>, tensor<?x?xf32> } : vector<16x8xi1> -> tensor<?x?xf32>

// -----

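// Only the trailing dimension is dynamic; the mask combines the static size 8
// with the runtime extent of dimension 1.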
func.func @vectorize_partial_dynamic_identity(%arg0: tensor<8x?xf32>,
                                              %arg1: tensor<8x?xf32>,
                                              %arg2: tensor<8x?xf32>) -> tensor<8x?xf32> {
  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                                         affine_map<(d0, d1) -> (d0, d1)>,
                                         affine_map<(d0, d1) -> (d0, d1)>],
                   iterator_types = ["parallel", "parallel"] }
    ins(%arg0, %arg1 : tensor<8x?xf32>, tensor<8x?xf32>)
    outs(%arg2 : tensor<8x?xf32>) {
    ^bb(%in0: f32, %in1: f32, %out: f32) :
      %0 = arith.addf %in0, %in1 : f32
      linalg.yield %0 : f32
    } -> tensor<8x?xf32>
  return %0 : tensor<8x?xf32>
}

// CHECK-LABEL:   func.func @vectorize_partial_dynamic_identity(
// CHECK-SAME:      %[[VAL_0:.*]]: tensor<8x?xf32>, %[[VAL_1:.*]]: tensor<8x?xf32>, %[[VAL_2:.*]]: tensor<8x?xf32>) -> tensor<8x?xf32> {
// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 1 : index
// CHECK-DAG:       %[[VAL_4:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor<8x?xf32>
// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 0 : index
// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 8 : index
// CHECK:           %[[VAL_8:.*]] = vector.create_mask %[[VAL_7]], %[[VAL_4]] : vector<8x32xi1>
// CHECK:           %[[VAL_9:.*]] = vector.mask %[[VAL_8]] { vector.transfer_read %[[VAL_0]][%[[VAL_5]], %[[VAL_5]]], %[[VAL_6]] {in_bounds = [true, true]} : tensor<8x?xf32>, vector<8x32xf32> } : vector<8x32xi1> -> vector<8x32xf32>
// CHECK:           %[[VAL_10:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:           %[[VAL_11:.*]] = vector.mask %[[VAL_8]] { vector.transfer_read %[[VAL_1]][%[[VAL_5]], %[[VAL_5]]], %[[VAL_10]] {in_bounds = [true, true]} : tensor<8x?xf32>, vector<8x32xf32> } : vector<8x32xi1> -> vector<8x32xf32>
// CHECK:           %[[VAL_12:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:           %[[VAL_13:.*]] = vector.mask %[[VAL_8]] { vector.transfer_read %[[VAL_2]][%[[VAL_5]], %[[VAL_5]]], %[[VAL_12]] {in_bounds = [true, true]} : tensor<8x?xf32>, vector<8x32xf32> } : vector<8x32xi1> -> vector<8x32xf32>
// CHECK:           %[[VAL_14:.*]] = arith.addf %[[VAL_9]], %[[VAL_11]] : vector<8x32xf32>
// CHECK:           %[[VAL_15:.*]] = arith.constant 0 : index
// CHECK:           %[[VAL_16:.*]] = vector.mask %[[VAL_8]] { vector.transfer_write %[[VAL_14]], %[[VAL_2]][%[[VAL_15]], %[[VAL_15]]] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<8x?xf32> } : vector<8x32xi1> -> tensor<8x?xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [8, 32] : !transform.any_op
    transform.yield
  }
}

// -----

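// The static shape 8x32 matches the vector sizes exactly, so no masks are
// generated.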
func.func @do_not_generate_masks(%arg0: tensor<8x32xf32>,
                                 %arg1: tensor<8x32xf32>,
                                 %arg2: tensor<8x32xf32>) -> tensor<8x32xf32> {
  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                                         affine_map<(d0, d1) -> (d0, d1)>,
                                         affine_map<(d0, d1) -> (d0, d1)>],
                   iterator_types = ["parallel", "parallel"] }
    ins(%arg0, %arg1 : tensor<8x32xf32>, tensor<8x32xf32>)
    outs(%arg2 : tensor<8x32xf32>) {
    ^bb(%in0: f32, %in1: f32, %out: f32) :
      %0 = arith.addf %in0, %in1 : f32
      linalg.yield %0 : f32
    } -> tensor<8x32xf32>
  return %0 : tensor<8x32xf32>
}

// CHECK-LABEL: func.func @do_not_generate_masks
// CHECK-NOT: vector.mask

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [8, 32] : !transform.any_op
    transform.yield
  }
}

// -----

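// The static dim 30 is smaller than the vector size 32, so masks are created
// from the constant bounds 8 and 30.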
func.func @vectorize_static_shape_with_mask(%arg0: tensor<8x30xf32>,
                                            %arg1: tensor<8x30xf32>,
                                            %arg2: tensor<8x30xf32>) -> tensor<8x30xf32> {
  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                                         affine_map<(d0, d1) -> (d0, d1)>,
                                         affine_map<(d0, d1) -> (d0, d1)>],
                   iterator_types = ["parallel", "parallel"] }
    ins(%arg0, %arg1 : tensor<8x30xf32>, tensor<8x30xf32>)
    outs(%arg2 : tensor<8x30xf32>) {
    ^bb(%in0: f32, %in1: f32, %out: f32) :
      %0 = arith.addf %in0, %in1 : f32
      linalg.yield %0 : f32
    } -> tensor<8x30xf32>
  return %0 : tensor<8x30xf32>
}

// CHECK-LABEL:   func.func @vectorize_static_shape_with_mask(
// CHECK-SAME:      %[[VAL_0:.*]]: tensor<8x30xf32>, %[[VAL_1:.*]]: tensor<8x30xf32>, %[[VAL_2:.*]]: tensor<8x30xf32>) -> tensor<8x30xf32> {
// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 0 : index
// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 8 : index
// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 30 : index
// CHECK:           %[[VAL_7:.*]] = vector.create_mask %[[VAL_5]], %[[VAL_6]] : vector<8x32xi1>
// CHECK:           %[[VAL_8:.*]] = vector.mask %[[VAL_7]] { vector.transfer_read %[[VAL_0]][%[[VAL_3]], %[[VAL_3]]], %[[VAL_4]] {in_bounds = [true, true]} : tensor<8x30xf32>, vector<8x32xf32> } : vector<8x32xi1> -> vector<8x32xf32>
// CHECK:           %[[VAL_9:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:           %[[VAL_10:.*]] = vector.mask %[[VAL_7]] { vector.transfer_read %[[VAL_1]][%[[VAL_3]], %[[VAL_3]]], %[[VAL_9]] {in_bounds = [true, true]} : tensor<8x30xf32>, vector<8x32xf32> } : vector<8x32xi1> -> vector<8x32xf32>
// CHECK:           %[[VAL_11:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:           %[[VAL_12:.*]] = vector.mask %[[VAL_7]] { vector.transfer_read %[[VAL_2]][%[[VAL_3]], %[[VAL_3]]], %[[VAL_11]] {in_bounds = [true, true]} : tensor<8x30xf32>, vector<8x32xf32> } : vector<8x32xi1> -> vector<8x32xf32>
// CHECK:           %[[VAL_13:.*]] = arith.addf %[[VAL_8]], %[[VAL_10]] : vector<8x32xf32>
// CHECK:           %[[VAL_14:.*]] = arith.constant 0 : index
// CHECK:           %[[VAL_15:.*]] = vector.mask %[[VAL_7]] { vector.transfer_write %[[VAL_13]], %[[VAL_2]][%[[VAL_14]], %[[VAL_14]]] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<8x30xf32> } : vector<8x32xi1> -> tensor<8x30xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [8, 32] : !transform.any_op
    transform.yield
  }
}

// -----

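// linalg.fill vectorizes to a vector.broadcast followed by a masked
// transfer_write.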
func.func @vectorize_dynamic_fill(%A : tensor<?x?xf32>, %arg0 : f32) -> tensor<?x?xf32> {
  %0 = linalg.fill ins(%arg0 : f32) outs(%A : tensor<?x?xf32>) -> tensor<?x?xf32>
  return %0 : tensor<?x?xf32>
}

// CHECK-LABEL: func.func @vectorize_dynamic_fill
//   CHECK: %[[DIM0:.*]] = tensor.dim
//   CHECK: %[[DIM1:.*]] = tensor.dim
//   CHECK: %[[MASK:.*]] = vector.create_mask %[[DIM0]], %[[DIM1]] : vector<8x16xi1>
//   CHECK: %[[BCAST:.*]] = vector.broadcast %{{.*}} : f32 to vector<8x16xf32>
//   CHECK: vector.mask %[[MASK]] { vector.transfer_write %[[BCAST]], {{.*}} {in_bounds = [true, true]} : vector<8x16xf32>, tensor<?x?xf32> } : vector<8x16xi1>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [8, 16] : !transform.any_op
    transform.yield
  }
}

// -----

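// Masked vectorization of linalg.transpose; the read mask (4x2) is the
// transpose of the write mask (2x4).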
// CHECK: #[[MAP:.*]] = affine_map<(d0, d1) -> (d1, d0)>
// CHECK: func @test_masked_vectorize_linalg_transpose
func.func @test_masked_vectorize_linalg_transpose(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
  // CHECK-DAG:  %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG:  %[[D0:.*]] = tensor.dim %arg0, %[[C0]]
  // CHECK-DAG:  %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG:  %[[D1:.*]] = tensor.dim %arg0, %[[C1]]
  // CHECK:      %[[MASK0:.*]] = vector.create_mask %[[D0]], %[[D1]]
  // CHECK:      %[[LOAD:.*]] = vector.mask %[[MASK0]] { vector.transfer_read %arg0{{.+}} permutation_map = #[[MAP]]{{.+}} }
  // CHECK-SAME:   vector<4x2xi1> -> vector<2x4xf32>
  // CHECK:      %[[MASK1:.*]] = vector.create_mask %[[D1]], %[[D0]]
  // CHECK:      %[[WRITE:.*]] = vector.mask %[[MASK1]] { vector.transfer_write %[[LOAD]], %arg1{{.+}} }
  // CHECK-SAME:   vector<2x4xi1> -> tensor<?x?xf32>
  // CHECK:      return %[[WRITE]]
  %0 = linalg.transpose ins(%arg0 : tensor<?x?xf32>) outs(%arg1 : tensor<?x?xf32>) permutation = [1, 0]
  return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.transpose"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [2, 4] : !transform.any_op
    transform.yield
  }
}

// -----

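// Masked vectorization of linalg.copy on dynamically shaped memrefs.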
// CHECK-LABEL: func @test_masked_vectorize_linalg_copy
func.func @test_masked_vectorize_linalg_copy(%A : memref<?x?xf32>, %B : memref<?x?xf32>) {
  // CHECK: %[[c0:.*]] = arith.constant 0 : index
  // CHECK: %[[d0:.*]] = memref.dim %{{.*}}, %[[c0]] : memref<?x?xf32>
  // CHECK: %[[c1:.*]] = arith.constant 1 : index
  // CHECK: %[[d1:.*]] = memref.dim %{{.*}}, %[[c1]] : memref<?x?xf32>
  // CHECK: %[[mask:.*]] = vector.create_mask %[[d0]], %[[d1]] : vector<2x4xi1>
  // CHECK: vector.mask %[[mask]] {{.*}} vector.transfer_read %{{.*}} {in_bounds = [true, true]} : memref<?x?xf32>, vector<2x4xf32> } : vector<2x4xi1> -> vector<2x4xf32>
  // CHECK: vector.mask %[[mask]] {{.*}} vector.transfer_write %{{.*}} {in_bounds = [true, true]} : vector<2x4xf32>, memref<?x?xf32> } : vector<2x4xi1>
  linalg.copy ins(%A : memref<?x?xf32>) outs(%B : memref<?x?xf32>)
  return
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [2, 4] : !transform.any_op
    transform.yield
  }
}

// -----

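// tensor.pad with a static 2x4 result: the read from the dynamic source is
// masked, while the write into the static destination is not.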
// CHECK-LABEL: func @test_masked_vectorize_pad
func.func @test_masked_vectorize_pad(
  %0 : tensor<?x?xf32>, %h0 : index, %h1 : index)
    -> tensor<2x4xf32>
{
  //  CHECK-DAG: %[[c42:.*]] = arith.constant 4.243000e+01 : f32
  //  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
  //  CHECK-DAG: %[[c0_0:.*]] = arith.constant 0 : index
  //      CHECK: %[[d0:.*]] = tensor.dim {{.*}} : tensor<?x?xf32>
  //      CHECK: %[[d1:.*]] = tensor.dim {{.*}} : tensor<?x?xf32>
  //      CHECK: %[[mask:.*]] = vector.create_mask %[[d0]], %[[d1]] : vector<2x4xi1>
  //      CHECK: %[[masked_read:.*]] = vector.mask %[[mask]] {
  // CHECK-SAME:   vector.transfer_read %{{.*}}[%[[c0_0]], %[[c0_0]]], %[[c42]]
  // CHECK-SAME:   {in_bounds = [true, true]} : tensor<?x?xf32>, vector<2x4xf32>
  // CHECK-SAME: } : vector<2x4xi1> -> vector<2x4xf32>
  //  CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index
  //  CHECK-DAG: %[[empty:.*]] = tensor.empty() : tensor<2x4xf32>
  //      CHECK: vector.transfer_write %[[masked_read]], %[[empty]][%[[c0_1]], %[[c0_1]]]
  // CHECK-SAME:   {in_bounds = [true, true]} : vector<2x4xf32>, tensor<2x4xf32>
  %cst = arith.constant 42.43 : f32
  %c0 = arith.constant 0 : index
  %1 = tensor.pad %0 low[0, %c0] high[%h0, %h1]  {
    ^bb0(%hh1: index, %hh2: index):
      tensor.yield %cst : f32
    } : tensor<?x?xf32> to tensor<2x4xf32>
  return %1: tensor<2x4xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["tensor.pad"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [2, 4] : !transform.any_op
    transform.yield
  }
}

// -----

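// tensor.pad with a dynamic result: both the read and the write are masked,
// with the write mask sized from the affine.apply-computed result dims.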
//       CHECK: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
//       CHECK: func @test_masked_vectorize_dynamic_pad
func.func @test_masked_vectorize_dynamic_pad(
  %0 : tensor<?x?xf32>, %h0 : index, %h1 : index)
    -> tensor<?x?xf32>
{
  //  CHECK-DAG: %[[c42:.*]] = arith.constant 4.243000e+01 : f32
  //  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
  //  CHECK-DAG: %[[res_d0:.+]] = affine.apply #[[MAP]]()
  //  CHECK-DAG: %[[res_d1:.+]] = affine.apply #[[MAP]]()
  //      CHECK: %[[c0_2:.*]] = arith.constant 0 : index
  //      CHECK: %[[d0:.*]] = tensor.dim {{.*}} : tensor<?x?xf32>
  //      CHECK: %[[d1:.*]] = tensor.dim {{.*}} : tensor<?x?xf32>
  //      CHECK: %[[mask:.*]] = vector.create_mask %[[d0]], %[[d1]] : vector<2x4xi1>
  //      CHECK: %[[masked_read:.*]] = vector.mask %[[mask]] {
  // CHECK-SAME:   vector.transfer_read %{{.*}}[%[[c0_2]], %[[c0_2]]], %[[c42]]
  // CHECK-SAME:   {in_bounds = [true, true]} : tensor<?x?xf32>, vector<2x4xf32>
  // CHECK-SAME: } : vector<2x4xi1> -> vector<2x4xf32>
  //  CHECK-DAG: %[[empty:.*]] = tensor.empty(%[[res_d0]], %[[res_d1]]) : tensor<?x?xf32>
  //  CHECK-DAG: %[[c0_3:.*]] = arith.constant 0 : index
  //      CHECK: %[[mask_2:.*]] = vector.create_mask %[[res_d0]], %[[res_d1]] : vector<2x4xi1>
  //      CHECK: %[[masked_write:.*]] = vector.mask %[[mask_2]] {
  // CHECK-SAME: vector.transfer_write %[[masked_read]], %[[empty]][%[[c0_3]], %[[c0_3]]]
  // CHECK-SAME:   {in_bounds = [true, true]} : vector<2x4xf32>, tensor<?x?xf32>
  //      CHECK: return %[[masked_write]] : tensor<?x?xf32>
  %cst = arith.constant 42.43 : f32
  %c0 = arith.constant 0 : index
  %1 = tensor.pad %0 low[0, %c0] high[%h0, %h1]  {
    ^bb0(%hh1: index, %hh2: index):
      tensor.yield %cst : f32
    } : tensor<?x?xf32> to tensor<?x?xf32>
  return %1: tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["tensor.pad"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [2, 4] : !transform.any_op
    transform.yield
  }
}

// -----

// Input identical to the test in vectorization-with-patterns.mlir. The output
// is different - vector sizes are user-specified (rather than inferred) and,
// since they match the static shapes exactly, no masking is needed.

func.func @test_vectorize_pack(%arg0: tensor<32x8x16xf32>, %arg1: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> {
  %pack = tensor.pack %arg0 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32>
  return %pack : tensor<4x1x32x16x2xf32>
}
//  CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32
//  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
//      CHECK: %[[read:.*]] = vector.transfer_read %{{.*}}[%[[c0]], %[[c0]], %[[c0]]], %[[cst]]
// CHECK-SAME:    {in_bounds = [true, true, true]} : tensor<32x8x16xf32>, vector<32x8x16xf32>
//      CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[read]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
//      CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [1, 3, 0, 4, 2] : vector<32x4x2x1x16xf32> to vector<4x1x32x16x2xf32>
//  CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index
//  CHECK-DAG: %[[empty:.*]] = tensor.empty() : tensor<4x1x32x16x2xf32>
//      CHECK: %[[write:.*]] = vector.transfer_write %[[transpose]], %[[empty]][%[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]]]
// CHECK-SAME:   {in_bounds = [true, true, true, true, true]} : vector<4x1x32x16x2xf32>, tensor<4x1x32x16x2xf32>
//      CHECK: return %[[write]] : tensor<4x1x32x16x2xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [4, 1, 32] : !transform.any_op
    transform.yield
  }
}

// -----

// Input identical to the test in vectorization-with-patterns.mlir. The output
// is different - vector sizes are user-specified (rather than inferred) and
// hence masking is used to handle the padded input shape.

func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
  %pad = arith.constant 0.000000e+00 : f32
  %pack = tensor.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
  return %pack : tensor<32x4x1x16x2xf32>
}
//  CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32
//  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
//  CHECK-DAG: %[[c32:.*]] = arith.constant 32 : index
//  CHECK-DAG: %[[c7:.*]] = arith.constant 7 : index
//  CHECK-DAG: %[[c15:.*]] = arith.constant 15 : index
//      CHECK: %[[mask:.*]] = vector.create_mask %[[c32]], %[[c7]], %[[c15]] : vector<32x8x16xi1>
//      CHECK: %[[masked_read:.*]] = vector.mask %[[mask]] {
// CHECK-SAME:   vector.transfer_read %{{.*}}[%[[c0]], %[[c0]], %[[c0]]], %[[cst]]
// CHECK-SAME:   {in_bounds = [true, true, true]} : tensor<32x7x15xf32>, vector<32x8x16xf32>
// CHECK-SAME: } : vector<32x8x16xi1> -> vector<32x8x16xf32>
//      CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[masked_read]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
//      CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32>
//  CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index
//  CHECK-DAG: %[[empty:.*]] = tensor.empty() : tensor<32x4x1x16x2xf32>
//      CHECK: %[[write:.*]] = vector.transfer_write %[[transpose]], %[[empty]][%[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]]]
// CHECK-SAME:   {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32>
//      CHECK: return %[[write]] : tensor<32x4x1x16x2xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [32, 4, 1] : !transform.any_op
    transform.yield
  }
}

// -----

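// tensor.pack with dynamic outer dimensions: both the read and the write are
// masked, and the write mask mixes dynamic dims with the static inner tiles.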
func.func @test_vectorize_dynamic_pack(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?x16x2xf32>) -> tensor<?x?x16x2xf32> {
  %pack = tensor.pack %arg0 inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %arg1 : tensor<?x?xf32> -> tensor<?x?x16x2xf32>
  return %pack : tensor<?x?x16x2xf32>
}
//  CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32
//  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
//  CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
//  CHECK-DAG: %[[d0:.*]] = tensor.dim {{.*}} %[[c0]] : tensor<?x?x16x2xf32>
//  CHECK-DAG: %[[d1:.*]] = tensor.dim {{.*}} %[[c1]] : tensor<?x?x16x2xf32>
//  CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index
//  CHECK-DAG: %[[c0_0:.*]] = arith.constant 0 : index
//  CHECK-DAG: %[[c1_0:.*]] = arith.constant 1 : index
//  CHECK-DAG: %[[d0_0:.*]] = tensor.dim {{.*}} %[[c0_0]] : tensor<?x?xf32>
//  CHECK-DAG: %[[d1_0:.*]] = tensor.dim {{.*}} %[[c1_0]] : tensor<?x?xf32>
//      CHECK: %[[mask:.*]] = vector.create_mask %[[d0_0]], %[[d1_0]] : vector<8x16xi1>
//      CHECK: %[[masked_read:.*]] = vector.mask %[[mask]] {
// CHECK-SAME:   vector.transfer_read %{{.*}}[%[[c0_1]], %[[c0_1]]], %[[cst]]
// CHECK-SAME:   {in_bounds = [true, true]} : tensor<?x?xf32>, vector<8x16xf32>
// CHECK-SAME: } : vector<8x16xi1> -> vector<8x16xf32>
//      CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[masked_read]] : vector<8x16xf32> to vector<4x2x1x16xf32>
//      CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [0, 2, 3, 1] : vector<4x2x1x16xf32> to vector<4x1x16x2xf32>
//  CHECK-DAG: %[[c0_2:.*]] = arith.constant 0 : index
//  CHECK-DAG: %[[c16:.*]] = arith.constant 16 : index
//  CHECK-DAG: %[[c2:.*]] = arith.constant 2 : index
//  CHECK-DAG: %[[empty:.*]] = tensor.empty(%[[d0]], %[[d1]]) : tensor<?x?x16x2xf32>
//      CHECK: %[[mask_0:.*]] = vector.create_mask %[[d0]], %[[d1]], %[[c16]], %[[c2]] : vector<4x1x16x2xi1>
//      CHECK: %[[masked_write:.*]] = vector.mask %[[mask_0]] {
// CHECK-SAME:   vector.transfer_write %[[transpose]], %[[empty]][%[[c0_2]], %[[c0_2]], %[[c0_2]], %[[c0_2]]]
// CHECK-SAME:   {in_bounds = [true, true, true, true]} : vector<4x1x16x2xf32>, tensor<?x?x16x2xf32>
//      CHECK: return %[[masked_write]] : tensor<?x?x16x2xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [4, 1] : !transform.any_op
    transform.yield
  }
}

// -----

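// Masked vectorization of linalg.matmul with sizes [8, 16, 4] for (m, n, k);
// each operand gets its own 2-D mask and the reduction uses a 3-D mask.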
func.func @matmul(%A: memref<?x?xf32>, %B: memref<?x?xf32>, %C: memref<?x?xf32>) {
  linalg.matmul ins(%A, %B: memref<?x?xf32>, memref<?x?xf32>)
            outs(%C: memref<?x?xf32>)
  return
}

// CHECK-LABEL:   func.func @matmul(
// CHECK-SAME:      %[[A:.*]]: memref<?x?xf32>, %[[B:.*]]: memref<?x?xf32>, %[[C:.*]]: memref<?x?xf32>) {
// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 0 : index
// CHECK-DAG:       %[[VAL_4:.*]] = memref.dim %[[A]], %[[VAL_3]] : memref<?x?xf32>
// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 1 : index
// CHECK-DAG:       %[[VAL_6:.*]] = memref.dim %[[B]], %[[VAL_5]] : memref<?x?xf32>
// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 1 : index
// CHECK-DAG:       %[[VAL_8:.*]] = memref.dim %[[A]], %[[VAL_7]] : memref<?x?xf32>
// CHECK:           %[[MASK_A:.*]] = vector.create_mask %[[VAL_4]], %[[VAL_8]] : vector<8x4xi1>
// CHECK:           %[[LOAD_A:.*]] = vector.mask %[[MASK_A]] { vector.transfer_read %[[A]]{{\[}}%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, true, true], permutation_map = #{{.*}}} : memref<?x?xf32>, vector<8x16x4xf32> } : vector<8x4xi1> -> vector<8x16x4xf32>
// CHECK:           %[[MASK_B:.*]] = vector.create_mask %[[VAL_8]], %[[VAL_6]] : vector<4x16xi1>
// CHECK:           %[[LOAD_B:.*]] = vector.mask %[[MASK_B]] { vector.transfer_read %[[B]]{{\[}}%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, true, true], permutation_map = #{{.*}}} : memref<?x?xf32>, vector<8x16x4xf32> } : vector<4x16xi1> -> vector<8x16x4xf32>
// CHECK:           %[[MASK_C:.*]] = vector.create_mask %[[VAL_4]], %[[VAL_6]] : vector<8x16xi1>
// CHECK:           %[[LOAD_C:.*]] = vector.mask %[[MASK_C]] { vector.transfer_read %[[C]]{{\[}}%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, true]} : memref<?x?xf32>, vector<8x16xf32> } : vector<8x16xi1> -> vector<8x16xf32>
// CHECK:           %[[MULF:.*]] = arith.mulf %[[LOAD_A]], %[[LOAD_B]] : vector<8x16x4xf32>
// CHECK:           %[[MASK_MULTI_RED:.*]] = vector.create_mask %[[VAL_4]], %[[VAL_6]], %[[VAL_8]] : vector<8x16x4xi1>
// CHECK:           %[[MULTI_RED:.*]] = vector.mask %[[MASK_MULTI_RED]] { vector.multi_reduction <add>, %[[MULF]], %[[LOAD_C]] [2] : vector<8x16x4xf32> to vector<8x16xf32> } : vector<8x16x4xi1> -> vector<8x16xf32>
// CHECK:           %[[C2:.*]] = arith.constant 0 : index
// CHECK:           vector.mask %[[MASK_C]] { vector.transfer_write %[[MULTI_RED]], %[[C]]{{\[}}%[[C2]], %[[C2]]] {in_bounds = [true, true]} : vector<8x16xf32>, memref<?x?xf32> } : vector<8x16xi1>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %matmul vector_sizes [8, 16, 4] : !transform.any_op
    transform.yield
  }
}

// -----

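// linalg.mmt4d with fully static shapes vectorizes without any masks.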
func.func @mmt4d(%A: memref<16x16x8x1xf32>, %B: memref<16x16x8x1xf32>, %C_in: memref<16x16x8x8xf32>) {
  linalg.mmt4d ins(%A, %B: memref<16x16x8x1xf32>, memref<16x16x8x1xf32>)
               outs(%C_in: memref<16x16x8x8xf32>)
  return
}

// CHECK-LABEL:   func.func @mmt4d(
// CHECK-SAME:      %[[A:.*]]: memref<16x16x8x1xf32>, %[[B:.*]]: memref<16x16x8x1xf32>, %[[C:.*]]: memref<16x16x8x8xf32>) {
// CHECK:           %[[VEC_A:.*]] = vector.transfer_read %[[A]]{{.*}} : memref<16x16x8x1xf32>, vector<16x16x16x8x8x1xf32>
// CHECK:           %[[VEC_B:.*]] = vector.transfer_read %[[B]]{{.*}} : memref<16x16x8x1xf32>, vector<16x16x16x8x8x1xf32>
// CHECK:           %[[VEC_C:.*]] = vector.transfer_read %[[C]]{{.*}} : memref<16x16x8x8xf32>, vector<16x16x8x8xf32>
// CHECK:           %[[MUL:.*]] = arith.mulf %[[VEC_A]], %[[VEC_B]] : vector<16x16x16x8x8x1xf32>
// CHECK:           %[[RED:.*]] = vector.multi_reduction <add>, %[[MUL]], %[[VEC_C]] [2, 5] : vector<16x16x16x8x8x1xf32> to vector<16x16x8x8xf32>
// CHECK:           vector.transfer_write %[[RED]], %[[C]]{{.*}} : vector<16x16x8x8xf32>, memref<16x16x8x8xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %mmt4d = transform.structured.match ops{["linalg.mmt4d"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %mmt4d : !transform.any_op
    transform.yield
  }
}

// -----

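// As @matmul above, but with a scalable size [16] for the n dimension, which
// propagates into the vector types as `[16]`.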
838func.func @matmul_scalable(%A: memref<?x?xf32>, %B: memref<?x?xf32>, %C: memref<?x?xf32>) {
839  linalg.matmul ins(%A, %B: memref<?x?xf32>, memref<?x?xf32>)
840            outs(%C: memref<?x?xf32>)
841  return
842}
843
844// CHECK-LABEL:   func.func @matmul_scalable(
845// CHECK-SAME:      %[[A:.*]]: memref<?x?xf32>, %[[B:.*]]: memref<?x?xf32>, %[[C:.*]]: memref<?x?xf32>) {
846// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 0 : index
847// CHECK-DAG:       %[[VAL_4:.*]] = memref.dim %[[A]], %[[VAL_3]] : memref<?x?xf32>
848// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 1 : index
849// CHECK-DAG:       %[[VAL_6:.*]] = memref.dim %[[B]], %[[VAL_5]] : memref<?x?xf32>
850// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 1 : index
851// CHECK-DAG:       %[[VAL_8:.*]] = memref.dim %[[A]], %[[VAL_7]] : memref<?x?xf32>
852// CHECK:           %[[MASK_A:.*]] = vector.create_mask %[[VAL_4]], %[[VAL_8]] : vector<8x4xi1>
853// CHECK:           %[[LOAD_A:.*]] = vector.mask %[[MASK_A]] { vector.transfer_read %[[A]]{{\[}}%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, true, true], permutation_map = #{{.*}}} : memref<?x?xf32>, vector<8x[16]x4xf32> } : vector<8x4xi1> -> vector<8x[16]x4xf32>
854// CHECK:           %[[MASK_B:.*]] = vector.create_mask %[[VAL_8]], %[[VAL_6]] : vector<4x[16]xi1>
855// CHECK:           %[[LOAD_B:.*]] = vector.mask %[[MASK_B]] { vector.transfer_read %[[B]]{{\[}}%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, true, true], permutation_map = #{{.*}}} : memref<?x?xf32>, vector<8x[16]x4xf32> } : vector<4x[16]xi1> -> vector<8x[16]x4xf32>
856// CHECK:           %[[MASK_C:.*]] = vector.create_mask %[[VAL_4]], %[[VAL_6]] : vector<8x[16]xi1>
857// CHECK:           %[[LOAD_C:.*]] = vector.mask %[[MASK_C]] { vector.transfer_read %[[C]]{{\[}}%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, true]} : memref<?x?xf32>, vector<8x[16]xf32> } : vector<8x[16]xi1> -> vector<8x[16]xf32>
858// CHECK:           %[[MULF:.*]] = arith.mulf %[[LOAD_A]], %[[LOAD_B]] : vector<8x[16]x4xf32>
859// CHECK:           %[[MASK_MULIT_RED:.*]] = vector.create_mask %[[VAL_4]], %[[VAL_6]], %[[VAL_8]] : vector<8x[16]x4xi1>
860// CHECK:           %[[MULTI_RED:.*]] = vector.mask %[[MASK_MULIT_RED]] { vector.multi_reduction <add>, %[[MULF]], %[[LOAD_C]] [2] : vector<8x[16]x4xf32> to vector<8x[16]xf32> } : vector<8x[16]x4xi1> -> vector<8x[16]xf32>
861// CHECK:           %[[C2:.*]] = arith.constant 0 : index
862// CHECK:           vector.mask %[[MASK_C]] { vector.transfer_write %[[MULTI_RED]], %[[C]]{{\[}}%[[C2]], %[[C2]]] {in_bounds = [true, true]} : vector<8x[16]xf32>, memref<?x?xf32> } : vector<8x[16]xi1>
863
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %matmul vector_sizes [8, [16], 4] : !transform.any_op
    transform.yield
  }
}

// -----

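// Masked vectorization of a fully dynamic tensor.unpack: the read mask is
// built from tensor.dim values of the packed operand plus the static inner
// tiles (16, 2).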
// CHECK-LABEL: func @test_vectorize_dynamic_shapes_unpack
func.func @test_vectorize_dynamic_shapes_unpack(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?x16x2xf32>) -> tensor<?x?xf32> {
// CHECK: %[[C0:.*]] = arith.constant 0
// CHECK: %[[DIM:.*]] = tensor.dim %arg0, %[[C0]] : tensor<?x?xf32>
// CHECK: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[DIM0:.*]] = tensor.dim %arg0, %[[C1]] : tensor<?x?xf32>
// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00
// CHECK: %[[C01:.*]] = arith.constant 0
// CHECK: %[[C02:.*]] = arith.constant 0
// CHECK: %[[DIM4:.*]] = tensor.dim %arg1, %[[C02]] : tensor<?x?x16x2xf32>
// CHECK: %[[CNST14:.*]] = arith.constant 1
// CHECK: %[[DIM6:.*]] = tensor.dim %arg1, %[[CNST14]] : tensor<?x?x16x2xf32>
// CHECK: %[[CNST16:.*]] = arith.constant 16 : index
// CHECK: %[[CNST2:.*]] = arith.constant 2 : index
// CHECK: %[[readMsk0:.*]] = vector.create_mask %[[DIM4]], %[[DIM6]], %[[CNST16]], %[[CNST2]] : vector<2x1x16x2xi1>
// CHECK: %[[read0:.*]] = vector.mask %[[readMsk0]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x16x2xf32>, vector<2x1x16x2xf32> } : vector<2x1x16x2xi1> -> vector<2x1x16x2xf32>
// CHECK: %[[trans0:.*]] = vector.transpose %[[read0]], [0, 3, 1, 2] : vector<2x1x16x2xf32> to vector<2x2x1x16xf32>
// CHECK: %[[sc0:.*]] = vector.shape_cast %[[trans0]] : vector<2x2x1x16xf32> to vector<4x16xf32>
// CHECK: %[[empt0:.*]] = tensor.empty
// CHECK: %[[writeMsk0:.*]] = vector.create_mask {{.*}} : vector<4x16xi1>
// CHECK: %[[write0:.*]] = vector.mask %[[writeMsk0]] {{.*}} vector.transfer_write %[[sc0]], %[[empt0]]
// CHECK: return %[[write0]]
  %ret = tensor.unpack %arg1 inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %arg0 : tensor<?x?x16x2xf32> -> tensor<?x?xf32>
  return %ret : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [4, 16] : !transform.any_op
    transform.yield
  }
}

// -----

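// Masked vectorization of a static tensor.unpack with user-specified vector
// sizes [512, 128] that exceed the 256x128 result, so both the read and the
// write are masked.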
// CHECK-LABEL: func @test_vectorize_unpack
func.func @test_vectorize_unpack(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> {
    // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
    // CHECK: %[[C0:.*]] = arith.constant 0 : index
    // CHECK: %[[C8:.*]] = arith.constant 8 : index
    // CHECK: %[[C80:.*]] = arith.constant 8 : index
    // CHECK: %[[C32:.*]] = arith.constant 32 : index
    // CHECK: %[[C16:.*]] = arith.constant 16 : index
    // CHECK: %[[MSK0:.*]] = vector.create_mask %[[C8]], %[[C80]], %[[C32]], %[[C16]] : vector<16x8x32x16xi1>
    // CHECK: %[[READ0:.*]] = vector.mask %[[MSK0]] {{.*}} : vector<16x8x32x16xi1> -> vector<16x8x32x16xf32>
    // CHECK: %[[TRANSP0:.*]] = vector.transpose %[[READ0]], [0, 2, 1, 3] : vector<16x8x32x16xf32> to vector<16x32x8x16xf32>
    // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP0]] : vector<16x32x8x16xf32> to vector<512x128xf32>
    // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32>
    // CHECK: %[[C01:.*]] = arith.constant 0 : index
    // CHECK: %[[C256:.*]] = arith.constant 256 : index
    // CHECK: %[[C128:.*]] = arith.constant 128 : index
    // CHECK: %[[WRITEMSK:.*]] = vector.create_mask %[[C256]], %[[C128]] : vector<512x128xi1>
    // CHECK: %[[WRIT:.*]] = vector.mask %[[WRITEMSK]] {{.*}} : vector<512x128xi1> -> tensor<256x128xf32>
    // CHECK: return %[[WRIT]] : tensor<256x128xf32>
  %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32>
  return %0 : tensor<256x128xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [512, 128] : !transform.any_op
    transform.yield
  }
}

// -----

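// When the requested vector sizes ([256, 128]) match the static result shape
// exactly, the unpack vectorizes to unmasked transfers.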
// CHECK-LABEL: func @test_vectorize_unpack_no_masks
func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> {
  // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
  // CHECK: %[[C0:.*]] = arith.constant 0 : index
  // CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32>
  // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [0, 2, 1, 3] : vector<8x8x32x16xf32> to vector<8x32x8x16xf32>
  // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<8x32x8x16xf32> to vector<256x128xf32>
  // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32>
  // CHECK: %[[C00:.*]] = arith.constant 0 : index
  // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32>
  // CHECK: return %[[WRIT]] : tensor<256x128xf32>
  %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32>
  return %0 : tensor<256x128xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op
    transform.yield
  }
}

// -----

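// Same as above, but outer_dims_perm = [1, 0] swaps the two outer dimensions,
// which shows up as a different vector.transpose permutation ([1, 2, 0, 3]).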
// CHECK-LABEL: func @test_vectorize_unpack_with_outer_perm
func.func @test_vectorize_unpack_with_outer_perm(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> {
  // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
  // CHECK: %[[C0:.*]] = arith.constant 0 : index
  // CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32>
  // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [1, 2, 0, 3] : vector<8x8x32x16xf32> to vector<8x32x8x16xf32>
  // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<8x32x8x16xf32> to vector<256x128xf32>
  // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32>
  // CHECK: %[[C00:.*]] = arith.constant 0 : index
  // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32>
  // CHECK: return %[[WRIT]] : tensor<256x128xf32>
  %0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32>
  return %0 : tensor<256x128xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op
    transform.yield
  }
}

// -----

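// tensor.pack vectorized without explicit vector sizes: the sizes are
// inferred from the static shapes, so the transfers are unmasked.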
// CHECK-LABEL: func @test_vectorize_pack_no_vector_sizes
func.func @test_vectorize_pack_no_vector_sizes(%arg0: tensor<64x4xf32>, %arg1: tensor<2x4x16x2xf32>) -> tensor<2x4x16x2xf32> {
  %pack = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %arg1 : tensor<64x4xf32> -> tensor<2x4x16x2xf32>
  return %pack : tensor<2x4x16x2xf32>
}
//  CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32
//  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
//      CHECK: %[[read:.*]] = vector.transfer_read %{{.*}}[%[[c0]], %[[c0]]], %[[cst]]
// CHECK-SAME:    {in_bounds = [true, true]} : tensor<64x4xf32>, vector<64x4xf32>
//      CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[read]] : vector<64x4xf32> to vector<4x16x2x2xf32>
//      CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [2, 0, 1, 3] : vector<4x16x2x2xf32> to vector<2x4x16x2xf32>
//  CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index
//  CHECK-DAG: %[[empty:.*]] = tensor.empty() : tensor<2x4x16x2xf32>
//      CHECK: %[[write:.*]] = vector.transfer_write %[[transpose]], %[[empty]][%[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]]]
// CHECK-SAME:   {in_bounds = [true, true, true, true]} : vector<2x4x16x2xf32>, tensor<2x4x16x2xf32>
//      CHECK: return %[[write]] : tensor<2x4x16x2xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 : !transform.any_op
    transform.yield
  }
}

// -----

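// tensor.pack with a padding value and no explicit vector sizes: the read
// uses in_bounds = [true, false, false], so the 7x15 tail is padded with
// %cst up to the inferred 8x16 shape.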
// CHECK-LABEL: func @test_vectorize_padded_pack_no_vector_sizes
func.func @test_vectorize_padded_pack_no_vector_sizes(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
  %pad = arith.constant 0.000000e+00 : f32
  %pack = tensor.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
  return %pack : tensor<32x4x1x16x2xf32>
}
//  CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32
//  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
//      CHECK: %[[transfer_read:.*]] = vector.transfer_read %{{.*}}[%[[c0]], %[[c0]], %[[c0]]], %[[cst]]
// CHECK-SAME:   {in_bounds = [true, false, false]} : tensor<32x7x15xf32>, vector<32x8x16xf32>
//      CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[transfer_read]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
//      CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32>
//  CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index
//  CHECK-DAG: %[[empty:.*]] = tensor.empty() : tensor<32x4x1x16x2xf32>
//      CHECK: %[[write:.*]] = vector.transfer_write %[[transpose]], %[[empty]][%[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]]]
// CHECK-SAME:   {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32>
//      CHECK: return %[[write]] : tensor<32x4x1x16x2xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 : !transform.any_op
    transform.yield
  }
}

// -----

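// tensor.unpack without explicit vector sizes: the sizes are taken from the
// static source type, producing unmasked transfers.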
// CHECK-LABEL: func @test_vectorize_unpack_no_vector_sizes
func.func @test_vectorize_unpack_no_vector_sizes(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> {
  // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
  // CHECK: %[[C0:.*]] = arith.constant 0 : index
  // CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32>
  // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [0, 2, 1, 3] : vector<8x8x32x16xf32> to vector<8x32x8x16xf32>
  // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<8x32x8x16xf32> to vector<256x128xf32>
  // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32>
  // CHECK: %[[C00:.*]] = arith.constant 0 : index
  // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32>
  // CHECK: return %[[WRIT]] : tensor<256x128xf32>
  %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32>
  return %0 : tensor<256x128xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 : !transform.any_op
    transform.yield
  }
}

// -----

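// The 64x128 unpacked data is written into a 64x127 destination, so the
// trailing dimension of the transfer_write is out of bounds
// (in_bounds = [true, false]).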
// CHECK-LABEL: func @test_vectorize_unpack_no_vector_sizes_slice_output
func.func @test_vectorize_unpack_no_vector_sizes_slice_output(%source: tensor<8x4x16x16xf32>, %dest: tensor<64x127xf32>) -> tensor<64x127xf32> {
  //      CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
  //      CHECK: %[[C0:.*]] = arith.constant 0 : index
  //      CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<8x4x16x16xf32>, vector<8x4x16x16xf32>
  //      CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [1, 2, 0, 3] : vector<8x4x16x16xf32> to vector<4x16x8x16xf32>
  //      CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<4x16x8x16xf32> to vector<64x128xf32>
  //      CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<64x127xf32>
  //      CHECK: %[[C00:.*]] = arith.constant 0 : index
  //      CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], %[[EMPT]]{{\[}}%[[C00]], %[[C00]]]
  // CHECK-SAME:  {in_bounds = [true, false]} : vector<64x128xf32>, tensor<64x127xf32>
  //      CHECK: return %[[WRIT]] : tensor<64x127xf32>
  %0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %dest : tensor<8x4x16x16xf32> -> tensor<64x127xf32>
  return %0 : tensor<64x127xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 : !transform.any_op
    transform.yield
  }
}

// -----

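// Unpack with a single inner tile and permuted outer dimensions; without
// explicit vector sizes the transfers stay unmasked and the outer permutation
// becomes a [1, 0, 2] vector.transpose.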
func.func @test_vectorize_unpack_no_vector_sizes_permute(%source: tensor<4x7x4xf32>, %dest: tensor<7x16xf32>) -> tensor<7x16xf32> {
  %0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [1] inner_tiles = [4] into %dest : tensor<4x7x4xf32> -> tensor<7x16xf32>
  return %0 : tensor<7x16xf32>
}
// CHECK-LABEL: func @test_vectorize_unpack_no_vector_sizes_permute
// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<4x7x4xf32>, vector<4x7x4xf32>
// CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [1, 0, 2] : vector<4x7x4xf32> to vector<7x4x4xf32>
// CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<7x4x4xf32> to vector<7x16xf32>
// CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<7x16xf32>
// CHECK: %[[C00:.*]] = arith.constant 0 : index
// CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<7x16xf32>, tensor<7x16xf32>
// CHECK: return %[[WRIT]] : tensor<7x16xf32>

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 : !transform.any_op
    transform.yield
  }
}