xref: /llvm-project/mlir/test/Dialect/Vector/vector-warp-distribute.mlir (revision ecaf2c335cd612646086ec53315cb1018a5b9d91)
1// RUN: mlir-opt %s --allow-unregistered-dialect --split-input-file \
2// RUN:   --test-vector-warp-distribute=rewrite-warp-ops-to-scf-if | FileCheck %s --check-prefix=CHECK-SCF-IF
3
4// RUN: mlir-opt %s --allow-unregistered-dialect --split-input-file \
5// RUN:   --test-vector-warp-distribute="hoist-uniform" | FileCheck --check-prefixes=CHECK-HOIST %s
6
7// RUN: mlir-opt %s --allow-unregistered-dialect --split-input-file \
8// RUN:   --test-vector-warp-distribute="hoist-uniform distribute-transfer-write max-transfer-write-elements=4" \
9// RUN:   | FileCheck --check-prefixes=CHECK-D %s
10
11// RUN: mlir-opt %s --allow-unregistered-dialect --split-input-file \
12// RUN:  --test-vector-warp-distribute=propagate-distribution --canonicalize \
13// RUN:  | FileCheck --check-prefixes=CHECK-PROP %s
14
15// RUN: mlir-opt %s --allow-unregistered-dialect --split-input-file \
16// RUN:   --test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" \
17// RUN:   --canonicalize | FileCheck --check-prefixes=CHECK-DIST-AND-PROP %s
18
19// CHECK-SCF-IF-DAG: #[[$TIMES2:.*]] = affine_map<()[s0] -> (s0 * 2)>
20// CHECK-SCF-IF-DAG: #[[$TIMES4:.*]] = affine_map<()[s0] -> (s0 * 4)>
21// CHECK-SCF-IF-DAG: #[[$TIMES8:.*]] = affine_map<()[s0] -> (s0 * 8)>
22// CHECK-SCF-IF-DAG: memref.global "private" @__shared_32xf32 : memref<32xf32, 3>
23// CHECK-SCF-IF-DAG: memref.global "private" @__shared_64xf32 : memref<64xf32, 3>
24// CHECK-SCF-IF-DAG: memref.global "private" @__shared_128xf32 : memref<128xf32, 3>
25// CHECK-SCF-IF-DAG: memref.global "private" @__shared_256xf32 : memref<256xf32, 3>
26
27// CHECK-SCF-IF-LABEL: func @rewrite_warp_op_to_scf_if(
28//  CHECK-SCF-IF-SAME:     %[[laneid:.*]]: index,
29//  CHECK-SCF-IF-SAME:     %[[v0:.*]]: vector<4xf32>, %[[v1:.*]]: vector<8xf32>)
// Input for the rewrite-warp-ops-to-scf-if run: a warp op with two distributed
// operands (4 and 8 elements per lane) and two distributed results (1 and 2
// elements per lane). The expectations interleaved below look for the operands
// and results being staged through memref.global buffers around an scf.if
// guarded by `laneid == 0`, with gpu.barrier ops before and after that scf.if.
30func.func @rewrite_warp_op_to_scf_if(%laneid: index,
31                                %v0: vector<4xf32>, %v1: vector<8xf32>) {
32//   CHECK-SCF-IF-DAG:   %[[c0:.*]] = arith.constant 0 : index
33//       CHECK-SCF-IF:   %[[is_lane_0:.*]] = arith.cmpi eq, %[[laneid]], %[[c0]]
34
35//       CHECK-SCF-IF:   %[[buffer_v0:.*]] = memref.get_global @__shared_128xf32
36//       CHECK-SCF-IF:   %[[s0:.*]] = affine.apply #[[$TIMES4]]()[%[[laneid]]]
37//       CHECK-SCF-IF:   vector.transfer_write %[[v0]], %[[buffer_v0]][%[[s0]]]
38//       CHECK-SCF-IF:   %[[buffer_v1:.*]] = memref.get_global @__shared_256xf32
39//       CHECK-SCF-IF:   %[[s1:.*]] = affine.apply #[[$TIMES8]]()[%[[laneid]]]
40//       CHECK-SCF-IF:   vector.transfer_write %[[v1]], %[[buffer_v1]][%[[s1]]]
41
42//   CHECK-SCF-IF-DAG:   gpu.barrier
43//   CHECK-SCF-IF-DAG:   %[[buffer_def_0:.*]] = memref.get_global @__shared_32xf32
44//   CHECK-SCF-IF-DAG:   %[[buffer_def_1:.*]] = memref.get_global @__shared_64xf32
45
46//       CHECK-SCF-IF:   scf.if %[[is_lane_0]] {
47  %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32]
48      args(%v0, %v1 : vector<4xf32>, vector<8xf32>) -> (vector<1xf32>, vector<2xf32>) {
49    ^bb0(%arg0: vector<128xf32>, %arg1: vector<256xf32>):
50//       CHECK-SCF-IF:     %[[arg1:.*]] = vector.transfer_read %[[buffer_v1]][%[[c0]]], %{{.*}} {in_bounds = [true]} : memref<256xf32, 3>, vector<256xf32>
51//       CHECK-SCF-IF:     %[[arg0:.*]] = vector.transfer_read %[[buffer_v0]][%[[c0]]], %{{.*}} {in_bounds = [true]} : memref<128xf32, 3>, vector<128xf32>
52//       CHECK-SCF-IF:     %[[def_0:.*]] = "some_def"(%[[arg0]]) : (vector<128xf32>) -> vector<32xf32>
53//       CHECK-SCF-IF:     %[[def_1:.*]] = "some_def"(%[[arg1]]) : (vector<256xf32>) -> vector<64xf32>
54    %2 = "some_def"(%arg0) : (vector<128xf32>) -> vector<32xf32>
55    %3 = "some_def"(%arg1) : (vector<256xf32>) -> vector<64xf32>
56//       CHECK-SCF-IF:     vector.transfer_write %[[def_0]], %[[buffer_def_0]][%[[c0]]]
57//       CHECK-SCF-IF:     vector.transfer_write %[[def_1]], %[[buffer_def_1]][%[[c0]]]
58    gpu.yield %2, %3 : vector<32xf32>, vector<64xf32>
59  }
60//       CHECK-SCF-IF:   }
61//       CHECK-SCF-IF:   gpu.barrier
62//       CHECK-SCF-IF:   %[[o1:.*]] = affine.apply #[[$TIMES2]]()[%[[laneid]]]
63//       CHECK-SCF-IF:   %[[r1:.*]] = vector.transfer_read %[[buffer_def_1]][%[[o1]]], %{{.*}} {in_bounds = [true]} : memref<64xf32, 3>, vector<2xf32>
64//       CHECK-SCF-IF:   %[[r0:.*]] = vector.transfer_read %[[buffer_def_0]][%[[laneid]]], %{{.*}} {in_bounds = [true]} : memref<32xf32, 3>, vector<1xf32>
65//       CHECK-SCF-IF:   "some_use"(%[[r0]]) : (vector<1xf32>) -> ()
66//       CHECK-SCF-IF:   "some_use"(%[[r1]]) : (vector<2xf32>) -> ()
67  "some_use"(%r#0) : (vector<1xf32>) -> ()
68  "some_use"(%r#1) : (vector<2xf32>) -> ()
69  return
70}
71
72// -----
73
74// CHECK-D-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 * 2 + 32)>
75
76// CHECK-DIST-AND-PROP-LABEL: func @warp(
77// CHECK-HOIST: memref.subview
78// CHECK-HOIST: memref.subview
79// CHECK-HOIST: memref.subview
80// CHECK-HOIST: gpu.warp_execute_on_lane_0
81
82//     CHECK-D: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<2xf32>, vector<1xf32>) {
83//     CHECK-D:   arith.addf {{.*}} : vector<32xf32>
84//     CHECK-D:   arith.addf {{.*}} : vector<64xf32>
85//     CHECK-D:   gpu.yield %{{.*}}, %{{.*}} : vector<64xf32>, vector<32xf32>
86// CHECK-D-DAG: vector.transfer_write %[[R]]#1, %{{.*}}[%{{.*}}] {in_bounds = [true]} : vector<1xf32>, memref<128xf32
87// CHECK-D-DAG: %[[ID1:.*]] = affine.apply #[[MAP1]]()[%{{.*}}]
88// CHECK-D-DAG: vector.transfer_write %[[R]]#0, %{{.*}}[%[[ID1]]] {in_bounds = [true]} : vector<2xf32>, memref<128xf32
89
90// CHECK-DIST-AND-PROP-NOT: gpu.warp_execute_on_lane_0
91// CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<1xf32>
92// CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<1xf32>
93// CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<2xf32>
94// CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<2xf32>
95// CHECK-DIST-AND-PROP: arith.addf {{.*}} : vector<1xf32>
96// CHECK-DIST-AND-PROP: arith.addf {{.*}} : vector<2xf32>
97// CHECK-DIST-AND-PROP: vector.transfer_write {{.*}} : vector<1xf32>
98// CHECK-DIST-AND-PROP: vector.transfer_write {{.*}} : vector<2xf32>
99
// Warp region that takes three subviews of the inputs, reads 32- and
// 64-element vectors, adds them and writes both sums to the third subview.
// The expectations above use it for three runs: the subviews appearing ahead
// of the warp op, the two transfer_writes consuming warp op results, and (with
// propagation also enabled) per-lane reads/adds/writes with no warp op ahead
// of them.
100func.func @warp(%laneid: index, %arg1: memref<1024xf32>, %arg2: memref<1024xf32>,
101           %arg3: memref<1024xf32>, %gid : index) {
102  gpu.warp_execute_on_lane_0(%laneid)[32] {
103    %sa = memref.subview %arg1[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, strided<[1], offset: ?>>
104    %sb = memref.subview %arg2[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, strided<[1], offset: ?>>
105    %sc = memref.subview %arg3[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, strided<[1], offset: ?>>
106    %c0 = arith.constant 0 : index
107    %c32 = arith.constant 32 : index
108    %cst = arith.constant 0.000000e+00 : f32
109    %2 = vector.transfer_read %sa[%c0], %cst : memref<128xf32, strided<[1], offset: ?>>, vector<32xf32>
110    %3 = vector.transfer_read %sa[%c32], %cst : memref<128xf32, strided<[1], offset: ?>>, vector<32xf32>
111    %4 = vector.transfer_read %sb[%c0], %cst : memref<128xf32, strided<[1], offset: ?>>, vector<64xf32>
112    %5 = vector.transfer_read %sb[%c32], %cst : memref<128xf32, strided<[1], offset: ?>>, vector<64xf32>
113    %6 = arith.addf %2, %3 : vector<32xf32>
114    %7 = arith.addf %4, %5 : vector<64xf32>
115    vector.transfer_write %6, %sc[%c0] : vector<32xf32>, memref<128xf32, strided<[1], offset: ?>>
116    vector.transfer_write %7, %sc[%c32] : vector<64xf32>, memref<128xf32, strided<[1], offset: ?>>
117  }
118  return
119}
120
121// -----
122
123// CHECK-D-LABEL: func @warp_extract(
124//       CHECK-D:   %[[WARPOP:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1x1xf32>)
125//       CHECK-D:     "test.dummy_op"
126//       CHECK-D:     "test.dummy_op"
127//       CHECK-D:     gpu.yield %{{.*}}, %{{.*}} : vector<1xf32>, vector<1x1xf32>
128//       CHECK-D:   }
129//       CHECK-D:   gpu.warp_execute_on_lane_0(%{{.*}})[32] {
130//       CHECK-D:     vector.transfer_write %[[WARPOP]]#1, %{{.*}}[%{{.*}}] {{.*}} : vector<1x1xf32>
131//       CHECK-D:   }
132//       CHECK-D:   gpu.warp_execute_on_lane_0(%{{.*}})[32] {
133//       CHECK-D:     vector.transfer_write %[[WARPOP]]#0, %{{.*}}[%{{.*}}] {{.*}} : vector<1xf32>
134//       CHECK-D:   }
135
// Two single-element writes (vector<1x1xf32> and vector<1xf32>) of opaque
// "test.dummy_op" values. The expectations above look for both values being
// yielded from a first warp op and each transfer_write ending up in its own
// following warp op.
136func.func @warp_extract(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : index) {
137  gpu.warp_execute_on_lane_0(%laneid)[32] {
138    %c0 = arith.constant 0 : index
139    %v = "test.dummy_op"() : () -> (vector<1xf32>)
140    %v1 = "test.dummy_op"() : () -> (vector<1x1xf32>)
141    vector.transfer_write %v1, %arg1[%c0, %c0] : vector<1x1xf32>, memref<1024x1024xf32>
142    vector.transfer_write %v, %arg1[%c0, %c0] : vector<1xf32>, memref<1024x1024xf32>
143  }
144  return
145}
146
147// -----
148
149// Check that we can distribute writes of the maximum allowed number of elements.
150
151// CHECK-D-LABEL: func @warp_extract_4_elems(
152//       CHECK-D:   %[[WARPOP:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4x1xf32>)
153//       CHECK-D:     "test.dummy_op"
154//       CHECK-D:     "test.dummy_op"
155//       CHECK-D:     gpu.yield %{{.*}}, %{{.*}} : vector<4xf32>, vector<4x1xf32>
156//       CHECK-D:   }
157//       CHECK-D:   gpu.warp_execute_on_lane_0(%{{.*}})[32] {
158//       CHECK-D:     vector.transfer_write %[[WARPOP]]#1, %{{.*}}[%{{.*}}] {{.*}} : vector<4x1xf32>
159//       CHECK-D:   }
160//       CHECK-D:   gpu.warp_execute_on_lane_0(%{{.*}})[32] {
161//       CHECK-D:     vector.transfer_write %[[WARPOP]]#0, %{{.*}}[%{{.*}}] {{.*}} : vector<4xf32>
162//       CHECK-D:   }
163
// Same shape as @warp_extract but with 4-element vectors, which equals the
// max-transfer-write-elements=4 value passed on the RUN line for this prefix.
// The expectations above still look for each write in its own warp op.
164func.func @warp_extract_4_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : index) {
165  gpu.warp_execute_on_lane_0(%laneid)[32] {
166    %c0 = arith.constant 0 : index
167    %v = "test.dummy_op"() : () -> (vector<4xf32>)
168    %v1 = "test.dummy_op"() : () -> (vector<4x1xf32>)
169    vector.transfer_write %v1, %arg1[%c0, %c0] : vector<4x1xf32>, memref<1024x1024xf32>
170    vector.transfer_write %v, %arg1[%c0, %c0] : vector<4xf32>, memref<1024x1024xf32>
171  }
172  return
173}
174
175// -----
176
177// Check that we do not distribute writes larger than the maximum allowed
178// number of elements.
179
180// CHECK-D-LABEL: func @warp_extract_5_elems(
181//       CHECK-D:   arith.constant 0 : index
182//       CHECK-D:   gpu.warp_execute_on_lane_0(%{{.*}})[32] {
183//       CHECK-D:     %[[V:.+]] = "test.dummy_op"
184//       CHECK-D:     %[[V1:.+]] = "test.dummy_op"
185//       CHECK-D:     vector.transfer_write %[[V1]], %{{.*}}[%{{.*}}] {{.*}} : vector<5x1xf32>
186//       CHECK-D:     vector.transfer_write %[[V]], %{{.*}}[%{{.*}}] {{.*}} : vector<5xf32>
187//       CHECK-D:   }
188
// 5-element variant: one element more than the max-transfer-write-elements=4
// value on the RUN line. The expectations above look for both dummy ops and
// both transfer_writes remaining inside a single warp op.
189func.func @warp_extract_5_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : index) {
190  gpu.warp_execute_on_lane_0(%laneid)[32] {
191    %c0 = arith.constant 0 : index
192    %v = "test.dummy_op"() : () -> (vector<5xf32>)
193    %v1 = "test.dummy_op"() : () -> (vector<5x1xf32>)
194    vector.transfer_write %v1, %arg1[%c0, %c0] : vector<5x1xf32>, memref<1024x1024xf32>
195    vector.transfer_write %v, %arg1[%c0, %c0] : vector<5xf32>, memref<1024x1024xf32>
196  }
197  return
198}
199
200// -----
201
202// Check that we do not distribute writes larger than the maximum allowed
203// number of elements, or multiples of the maximum number of elements.
204
205// CHECK-D-LABEL: func @warp_extract_8_elems(
206//       CHECK-D:   arith.constant 0 : index
207//       CHECK-D:   gpu.warp_execute_on_lane_0(%{{.*}})[32] {
208//       CHECK-D:     %[[V:.+]] = "test.dummy_op"
209//       CHECK-D:     %[[V1:.+]] = "test.dummy_op"
210//       CHECK-D:     vector.transfer_write %[[V1]], %{{.*}}[%{{.*}}] {{.*}} : vector<8x1xf32>
211//       CHECK-D:     vector.transfer_write %[[V]], %{{.*}}[%{{.*}}] {{.*}} : vector<8xf32>
212//       CHECK-D:   }
213
// 8-element variant: twice the max-transfer-write-elements=4 value on the RUN
// line. As with the 5-element case, the expectations above look for both
// transfer_writes remaining inside a single warp op.
214func.func @warp_extract_8_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : index) {
215  gpu.warp_execute_on_lane_0(%laneid)[32] {
216    %c0 = arith.constant 0 : index
217    %v = "test.dummy_op"() : () -> (vector<8xf32>)
218    %v1 = "test.dummy_op"() : () -> (vector<8x1xf32>)
219    vector.transfer_write %v1, %arg1[%c0, %c0] : vector<8x1xf32>, memref<1024x1024xf32>
220    vector.transfer_write %v, %arg1[%c0, %c0] : vector<8xf32>, memref<1024x1024xf32>
221  }
222  return
223}
224
225// -----
226
227// CHECK-PROP-LABEL:   func @warp_dead_result(
// The warp op produces three results but only %r#1 is used (by the return).
// The expectations below look for a warp op with a single vector<1xf32>
// result and a single yielded vector<32xf32> value.
228func.func @warp_dead_result(%laneid: index) -> (vector<1xf32>) {
229  // CHECK-PROP: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>)
230  %r:3 = gpu.warp_execute_on_lane_0(%laneid)[32] ->
231    (vector<1xf32>, vector<1xf32>, vector<1xf32>) {
232    %2 = "some_def"() : () -> (vector<32xf32>)
233    %3 = "some_def"() : () -> (vector<32xf32>)
234    %4 = "some_def"() : () -> (vector<32xf32>)
235  // CHECK-PROP:   gpu.yield %{{.*}} : vector<32xf32>
236    gpu.yield %2, %3, %4 : vector<32xf32>, vector<32xf32>, vector<32xf32>
237  }
238  // CHECK-PROP: return %[[R]] : vector<1xf32>
239  return %r#1 : vector<1xf32>
240}
241
242// -----
243
244// CHECK-PROP-LABEL:   func @warp_propagate_operand(
245//  CHECK-PROP-SAME:   %[[ID:.*]]: index, %[[V:.*]]: vector<4xf32>)
// The warp region does nothing but yield its own block argument, which
// corresponds to the distributed operand %v0. The expectation below is that
// the function ends up returning its vector<4xf32> argument directly.
246func.func @warp_propagate_operand(%laneid: index, %v0: vector<4xf32>)
247  -> (vector<4xf32>) {
248  %r = gpu.warp_execute_on_lane_0(%laneid)[32]
249     args(%v0 : vector<4xf32>) -> (vector<4xf32>) {
250     ^bb0(%arg0 : vector<128xf32>) :
251    gpu.yield %arg0 : vector<128xf32>
252  }
253  // CHECK-PROP: return %[[V]] : vector<4xf32>
254  return %r : vector<4xf32>
255}
256
257// -----
258
259#map0 = affine_map<()[s0] -> (s0 * 2)>
260
261// CHECK-PROP-LABEL:   func @warp_propagate_elementwise(
// Elementwise propagation: the warp region computes two arith.addf ops (on
// vector<32xf32> and vector<64xf32>) and yields the sums. The expectations
// below look for the warp op yielding the four "some_def" operands instead,
// with the adds recreated after it on the per-lane types (vector<1xf32> and
// vector<2xf32>) and feeding the two transfer_writes.
262func.func @warp_propagate_elementwise(%laneid: index, %dest: memref<1024xf32>) {
266  // CHECK-PROP: %[[R:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xf32>, vector<2xf32>, vector<2xf32>)
267  %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] ->
268    (vector<1xf32>, vector<2xf32>) {
269    // CHECK-PROP: %[[V0:.*]] = "some_def"() : () -> vector<32xf32>
270    // CHECK-PROP: %[[V1:.*]] = "some_def"() : () -> vector<32xf32>
271    // CHECK-PROP: %[[V2:.*]] = "some_def"() : () -> vector<64xf32>
272    // CHECK-PROP: %[[V3:.*]] = "some_def"() : () -> vector<64xf32>
273    // CHECK-PROP: gpu.yield %[[V0]], %[[V1]], %[[V2]], %[[V3]] : vector<32xf32>, vector<32xf32>, vector<64xf32>, vector<64xf32>
274    %2 = "some_def"() : () -> (vector<32xf32>)
275    %3 = "some_def"() : () -> (vector<32xf32>)
276    %4 = "some_def"() : () -> (vector<64xf32>)
277    %5 = "some_def"() : () -> (vector<64xf32>)
278    %6 = arith.addf %2, %3 : vector<32xf32>
279    %7 = arith.addf %4, %5 : vector<64xf32>
280    gpu.yield %6, %7 : vector<32xf32>, vector<64xf32>
281  }
282  // CHECK-PROP: %[[A0:.*]] = arith.addf %[[R]]#2, %[[R]]#3 : vector<2xf32>
283  // CHECK-PROP: %[[A1:.*]] = arith.addf %[[R]]#0, %[[R]]#1 : vector<1xf32>
  // Per-lane offset for the 2-element write: lane id times 2 (see #map0).
284  %id2 = affine.apply #map0()[%laneid]
285  // CHECK-PROP: vector.transfer_write %[[A1]], {{.*}} : vector<1xf32>, memref<1024xf32>
286  // CHECK-PROP: vector.transfer_write %[[A0]], {{.*}} : vector<2xf32>, memref<1024xf32>
287  vector.transfer_write %r#0, %dest[%laneid] : vector<1xf32>, memref<1024xf32>
288  vector.transfer_write %r#1, %dest[%id2] : vector<2xf32>, memref<1024xf32>
289  return
290}
291
292// -----
293
294// CHECK-PROP-LABEL: func @warp_propagate_scalar_arith(
295//       CHECK-PROP:   %[[r:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} {
296//       CHECK-PROP:     %[[some_def0:.*]] = "some_def"
297//       CHECK-PROP:     %[[some_def1:.*]] = "some_def"
298//       CHECK-PROP:     gpu.yield %[[some_def0]], %[[some_def1]]
299//       CHECK-PROP:   }
300//       CHECK-PROP:   arith.addf %[[r]]#0, %[[r]]#1 : f32
// Scalar variant of elementwise propagation: the warp region adds two f32
// values and yields the sum. The expectations above look for the warp op
// yielding both "some_def" values and the arith.addf appearing after it.
301func.func @warp_propagate_scalar_arith(%laneid: index) {
302  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
303    %0 = "some_def"() : () -> (f32)
304    %1 = "some_def"() : () -> (f32)
305    %2 = arith.addf %0, %1 : f32
306    gpu.yield %2 : f32
307  }
308  vector.print %r : f32
309  return
310}
311
312// -----
313
314// CHECK-PROP-LABEL: func @warp_propagate_cast(
315//   CHECK-PROP-NOT:   gpu.warp_execute_on_lane_0
316//       CHECK-PROP:   %[[result:.*]] = arith.sitofp %{{.*}} : i32 to f32
317//       CHECK-PROP:   return %[[result]]
// The warp region only casts the function argument %i (defined outside the
// region) with arith.sitofp and yields it. The expectations above look for
// the cast feeding the return with no warp op ahead of it.
318func.func @warp_propagate_cast(%laneid : index, %i : i32) -> (f32) {
319  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
320    %casted = arith.sitofp %i : i32 to f32
321    gpu.yield %casted : f32
322  }
323  return %r : f32
324}
325
326// -----
327
328#map0 = affine_map<()[s0] -> (s0 * 2)>
329
330//  CHECK-PROP-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 * 2)>
331
332// CHECK-PROP:   func @warp_propagate_read
333//  CHECK-PROP-SAME:     (%[[ID:.*]]: index
// Two transfer_reads inside the warp op feed the 1- and 2-element-per-lane
// results that are written out after it. The expectations below look for
// per-lane reads indexed by the lane id and by the lane id times 2.
// NOTE(review): %c32 is bound to 0, not 32, so both reads start at index 0;
// the expected indices carry no extra offset, so the expectations rely on
// that value. Confirm before renaming the constant or changing its value.
334func.func @warp_propagate_read(%laneid: index, %src: memref<1024xf32>, %dest: memref<1024xf32>) {
335// CHECK-PROP-NOT: warp_execute_on_lane_0
336// CHECK-PROP-DAG: %[[R0:.*]] = vector.transfer_read %arg1[%[[ID]]], %{{.*}} : memref<1024xf32>, vector<1xf32>
337// CHECK-PROP-DAG: %[[ID2:.*]] = affine.apply #[[MAP0]]()[%[[ID]]]
338// CHECK-PROP-DAG: %[[R1:.*]] = vector.transfer_read %arg1[%[[ID2]]], %{{.*}} : memref<1024xf32>, vector<2xf32>
339// CHECK-PROP: vector.transfer_write %[[R0]], {{.*}} : vector<1xf32>, memref<1024xf32>
340// CHECK-PROP: vector.transfer_write %[[R1]], {{.*}} : vector<2xf32>, memref<1024xf32>
341  %c0 = arith.constant 0 : index
342  %c32 = arith.constant 0 : index
343  %cst = arith.constant 0.000000e+00 : f32
344  %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] ->(vector<1xf32>, vector<2xf32>) {
345    %2 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>, vector<32xf32>
346    %3 = vector.transfer_read %src[%c32], %cst : memref<1024xf32>, vector<64xf32>
347    gpu.yield %2, %3 : vector<32xf32>, vector<64xf32>
348  }
349  %id2 = affine.apply #map0()[%laneid]
350  vector.transfer_write %r#0, %dest[%laneid] : vector<1xf32>, memref<1024xf32>
351  vector.transfer_write %r#1, %dest[%id2] : vector<2xf32>, memref<1024xf32>
352  return
353}
354
355// -----
356
357// CHECK-PROP-LABEL: func @fold_vector_broadcast(
358//       CHECK-PROP:   %[[r:.*]] = gpu.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>)
359//       CHECK-PROP:     %[[some_def:.*]] = "some_def"
360//       CHECK-PROP:     gpu.yield %[[some_def]] : vector<1xf32>
361//       CHECK-PROP:   vector.print %[[r]] : vector<1xf32>
// A vector<1xf32> value is broadcast to vector<32xf32> and yielded as a
// vector<1xf32> per-lane result. The expectations above look for the warp op
// yielding the "some_def" value directly, with the result printed as
// vector<1xf32>.
362func.func @fold_vector_broadcast(%laneid: index) {
363  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
364    %0 = "some_def"() : () -> (vector<1xf32>)
365    %1 = vector.broadcast %0 : vector<1xf32> to vector<32xf32>
366    gpu.yield %1 : vector<32xf32>
367  }
368  vector.print %r : vector<1xf32>
369  return
370}
371
372// -----
373
374// CHECK-PROP-LABEL: func @extract_vector_broadcast(
375//       CHECK-PROP:   %[[r:.*]] = gpu.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>)
376//       CHECK-PROP:     %[[some_def:.*]] = "some_def"
377//       CHECK-PROP:     gpu.yield %[[some_def]] : vector<1xf32>
378//       CHECK-PROP:   %[[broadcasted:.*]] = vector.broadcast %[[r]] : vector<1xf32> to vector<2xf32>
379//       CHECK-PROP:   vector.print %[[broadcasted]] : vector<2xf32>
// A vector<1xf32> value is broadcast to vector<64xf32> and yielded as a
// vector<2xf32> per-lane result. The expectations above look for the warp op
// yielding the vector<1xf32> source and a vector.broadcast to vector<2xf32>
// appearing after it.
380func.func @extract_vector_broadcast(%laneid: index) {
381  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
382    %0 = "some_def"() : () -> (vector<1xf32>)
383    %1 = vector.broadcast %0 : vector<1xf32> to vector<64xf32>
384    gpu.yield %1 : vector<64xf32>
385  }
386  vector.print %r : vector<2xf32>
387  return
388}
389
390// -----
391
392// CHECK-PROP-LABEL: func @extract_scalar_vector_broadcast(
393//       CHECK-PROP:   %[[r:.*]] = gpu.warp_execute_on_lane_0{{.*}} -> (f32)
394//       CHECK-PROP:     %[[some_def:.*]] = "some_def"
395//       CHECK-PROP:     gpu.yield %[[some_def]] : f32
396//       CHECK-PROP:   %[[broadcasted:.*]] = vector.broadcast %[[r]] : f32 to vector<2xf32>
397//       CHECK-PROP:   vector.print %[[broadcasted]] : vector<2xf32>
// Scalar-source variant: an f32 value is broadcast to vector<64xf32> and
// yielded as a vector<2xf32> per-lane result. The expectations above look for
// the warp op yielding the f32 and a vector.broadcast from f32 to
// vector<2xf32> appearing after it.
398func.func @extract_scalar_vector_broadcast(%laneid: index) {
399  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
400    %0 = "some_def"() : () -> (f32)
401    %1 = vector.broadcast %0 : f32 to vector<64xf32>
402    gpu.yield %1 : vector<64xf32>
403  }
404  vector.print %r : vector<2xf32>
405  return
406}
407
408// -----
409
410// CHECK-PROP-LABEL:   func @warp_scf_for(
411// CHECK-PROP: %[[INI:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>) {
412// CHECK-PROP:   %[[INI1:.*]] = "some_def"() : () -> vector<128xf32>
413// CHECK-PROP:   gpu.yield %[[INI1]] : vector<128xf32>
414// CHECK-PROP: }
415// CHECK-PROP: %[[F:.*]] = scf.for %[[IT:.+]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[FARG:.*]] = %[[INI]]) -> (vector<4xf32>) {
416// CHECK-PROP:   %[[A:.*]] = arith.addi %[[IT]], %{{.*}} : index
417// CHECK-PROP:   %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG]] : vector<4xf32>) -> (vector<4xf32>) {
418// CHECK-PROP:    ^bb0(%[[ARG:.*]]: vector<128xf32>):
419// CHECK-PROP:      %[[ACC:.*]] = "some_def"(%[[A]], %[[ARG]]) : (index, vector<128xf32>) -> vector<128xf32>
420// CHECK-PROP:      gpu.yield %[[ACC]] : vector<128xf32>
421// CHECK-PROP:   }
422// CHECK-PROP:   scf.yield %[[W]] : vector<4xf32>
423// CHECK-PROP: }
424// CHECK-PROP: "some_use"(%[[F]]) : (vector<4xf32>) -> ()
// An scf.for with one vector<128xf32> iter_arg sits inside the warp region.
// The expectations above look for the initial value coming from a first warp
// op, the scf.for carrying a vector<4xf32> iter_arg outside any warp op, and a
// new warp op inside the loop body that takes the iter_arg as an operand.
425func.func @warp_scf_for(%arg0: index) {
426  %c128 = arith.constant 128 : index
427  %c1 = arith.constant 1 : index
428  %c0 = arith.constant 0 : index
429  %0 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) {
430    %ini = "some_def"() : () -> (vector<128xf32>)
431    %3 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini) -> (vector<128xf32>) {
432      %add = arith.addi %arg3, %c1 : index
433      %acc = "some_def"(%add, %arg4) : (index, vector<128xf32>) -> (vector<128xf32>)
434      scf.yield %acc : vector<128xf32>
435    }
436    gpu.yield %3 : vector<128xf32>
437  }
438  "some_use"(%0) : (vector<4xf32>) -> ()
439  return
440}
441
442// -----
443
444// CHECK-PROP-LABEL:   func @warp_scf_for_use_from_above(
445// CHECK-PROP: %[[INI:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) {
446// CHECK-PROP:   %[[INI1:.*]] = "some_def"() : () -> vector<128xf32>
447// CHECK-PROP:   %[[USE:.*]] = "some_def_above"() : () -> vector<128xf32>
448// CHECK-PROP:   gpu.yield %[[INI1]], %[[USE]] : vector<128xf32>, vector<128xf32>
449// CHECK-PROP: }
450// CHECK-PROP: %[[F:.*]] = scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[FARG:.*]] = %[[INI]]#0) -> (vector<4xf32>) {
451// CHECK-PROP:   %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG]], %[[INI]]#1 : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>) {
452// CHECK-PROP:    ^bb0(%[[ARG0:.*]]: vector<128xf32>, %[[ARG1:.*]]: vector<128xf32>):
453// CHECK-PROP:      %[[ACC:.*]] = "some_def"(%[[ARG0]], %[[ARG1]]) : (vector<128xf32>, vector<128xf32>) -> vector<128xf32>
454// CHECK-PROP:      gpu.yield %[[ACC]] : vector<128xf32>
455// CHECK-PROP:   }
456// CHECK-PROP:   scf.yield %[[W]] : vector<4xf32>
457// CHECK-PROP: }
458// CHECK-PROP: "some_use"(%[[F]]) : (vector<4xf32>) -> ()
// Like @warp_scf_for, but the loop body also uses %use_from_above, a value
// defined in the warp region outside the loop. The expectations above look
// for that value being yielded as a second result of the first warp op and
// passed as an extra operand to the warp op inside the loop.
459func.func @warp_scf_for_use_from_above(%arg0: index) {
460  %c128 = arith.constant 128 : index
461  %c1 = arith.constant 1 : index
462  %c0 = arith.constant 0 : index
463  %0 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) {
464    %ini = "some_def"() : () -> (vector<128xf32>)
465    %use_from_above = "some_def_above"() : () -> (vector<128xf32>)
466    %3 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini) -> (vector<128xf32>) {
467      %acc = "some_def"(%arg4, %use_from_above) : (vector<128xf32>, vector<128xf32>) -> (vector<128xf32>)
468      scf.yield %acc : vector<128xf32>
469    }
470    gpu.yield %3 : vector<128xf32>
471  }
472  "some_use"(%0) : (vector<4xf32>) -> ()
473  return
474}
475
476// -----
477
478// CHECK-PROP-LABEL:   func @warp_scf_for_swap(
479// CHECK-PROP: %[[INI:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) {
480// CHECK-PROP:   %[[INI1:.*]] = "some_def"() : () -> vector<128xf32>
481// CHECK-PROP:   %[[INI2:.*]] = "some_def"() : () -> vector<128xf32>
482// CHECK-PROP:   gpu.yield %[[INI1]], %[[INI2]] : vector<128xf32>, vector<128xf32>
483// CHECK-PROP: }
484// CHECK-PROP: %[[F:.*]]:2 = scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[FARG1:.*]] = %[[INI]]#0, %[[FARG2:.*]] = %[[INI]]#1) -> (vector<4xf32>, vector<4xf32>) {
485// CHECK-PROP:   %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG1]], %[[FARG2]] : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>) {
486// CHECK-PROP:    ^bb0(%[[ARG1:.*]]: vector<128xf32>, %[[ARG2:.*]]: vector<128xf32>):
487// CHECK-PROP:      %[[ACC1:.*]] = "some_def"(%[[ARG1]]) : (vector<128xf32>) -> vector<128xf32>
488// CHECK-PROP:      %[[ACC2:.*]] = "some_def"(%[[ARG2]]) : (vector<128xf32>) -> vector<128xf32>
489// CHECK-PROP:      gpu.yield %[[ACC2]], %[[ACC1]] : vector<128xf32>, vector<128xf32>
490// CHECK-PROP:   }
491// CHECK-PROP:   scf.yield %[[W]]#0, %[[W]]#1 : vector<4xf32>, vector<4xf32>
492// CHECK-PROP: }
493// CHECK-PROP: "some_use"(%[[F]]#0) : (vector<4xf32>) -> ()
494// CHECK-PROP: "some_use"(%[[F]]#1) : (vector<4xf32>) -> ()
// Two-iter_arg loop whose scf.yield returns the accumulators in swapped order
// (%acc2, %acc1). The expectations above look for the swap being kept in the
// gpu.yield of the warp op inside the loop, while the scf.yield forwards
// that warp op's results in order.
495func.func @warp_scf_for_swap(%arg0: index) {
496  %c128 = arith.constant 128 : index
497  %c1 = arith.constant 1 : index
498  %c0 = arith.constant 0 : index
499  %0:2 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>, vector<4xf32>) {
500    %ini1 = "some_def"() : () -> (vector<128xf32>)
501    %ini2 = "some_def"() : () -> (vector<128xf32>)
502    %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini1, %arg5 = %ini2) -> (vector<128xf32>, vector<128xf32>) {
503      %acc1 = "some_def"(%arg4) : (vector<128xf32>) -> (vector<128xf32>)
504      %acc2 = "some_def"(%arg5) : (vector<128xf32>) -> (vector<128xf32>)
505      scf.yield %acc2, %acc1 : vector<128xf32>, vector<128xf32>
506    }
507    gpu.yield %3#0, %3#1 : vector<128xf32>, vector<128xf32>
508  }
509  "some_use"(%0#0) : (vector<4xf32>) -> ()
510  "some_use"(%0#1) : (vector<4xf32>) -> ()
511  return
512}
513
514// -----
515
516// CHECK-PROP-LABEL:   func @warp_scf_for_swap_no_yield(
517// CHECK-PROP:           scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
518// CHECK-PROP-NEXT:        gpu.warp_execute_on_lane_0(%{{.*}})[32] {
519// CHECK-PROP-NEXT:          "some_op"() : () -> ()
520// CHECK-PROP-NEXT:        }
521// CHECK-PROP-NEXT:      }
// Loop without iter_args or results inside a result-less warp op. The
// expectations above look for the nesting being inverted: an scf.for whose
// body is a warp op containing only "some_op".
522func.func @warp_scf_for_swap_no_yield(%arg0: index) {
523  %c128 = arith.constant 128 : index
524  %c1 = arith.constant 1 : index
525  %c0 = arith.constant 0 : index
526  gpu.warp_execute_on_lane_0(%arg0)[32] {
527    scf.for %arg3 = %c0 to %c128 step %c1 {
528      "some_op"() : () -> ()
529    }
530  }
531  return
532}
533
534// -----
535
536#map = affine_map<()[s0] -> (s0 * 4)>
537#map1 = affine_map<()[s0] -> (s0 * 128 + 128)>
538#map2 = affine_map<()[s0] -> (s0 * 4 + 128)>
539
540// CHECK-PROP-LABEL:   func @warp_scf_for_multiple_yield(
541//       CHECK-PROP:   gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) {
542//  CHECK-PROP-NEXT:     "some_def"() : () -> vector<32xf32>
543//  CHECK-PROP-NEXT:     gpu.yield %{{.*}} : vector<32xf32>
544//  CHECK-PROP-NEXT:   }
545//   CHECK-PROP-NOT:   gpu.warp_execute_on_lane_0
546//       CHECK-PROP:   vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32>
547//       CHECK-PROP:   vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32>
548//       CHECK-PROP:   %{{.*}}:2 = scf.for {{.*}} -> (vector<4xf32>, vector<4xf32>) {
549//   CHECK-PROP-NOT:     gpu.warp_execute_on_lane_0
550//       CHECK-PROP:     vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32>
551//       CHECK-PROP:     vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32>
552//       CHECK-PROP:     arith.addf {{.*}} : vector<4xf32>
553//       CHECK-PROP:     arith.addf {{.*}} : vector<4xf32>
554//       CHECK-PROP:     scf.yield {{.*}} : vector<4xf32>, vector<4xf32>
555//       CHECK-PROP:   }
// The warp op yields an opaque "some_def" value together with both results of
// an scf.for whose iter_args start from two transfer_reads. The expectations
// above look for a warp op that only yields the "some_def" value, followed by
// vector<4xf32> reads and an scf.for on vector<4xf32> with no warp op inside
// the loop body.
556func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref<?xf32>, %arg2: memref<?xf32>) {
557  %c256 = arith.constant 256 : index
558  %c128 = arith.constant 128 : index
559  %c1 = arith.constant 1 : index
560  %c0 = arith.constant 0 : index
561  %cst = arith.constant 0.000000e+00 : f32
562  %0:3 = gpu.warp_execute_on_lane_0(%arg0)[32] ->
563  (vector<1xf32>, vector<4xf32>, vector<4xf32>) {
564    %def = "some_def"() : () -> (vector<32xf32>)
565    %r1 = vector.transfer_read %arg2[%c0], %cst {in_bounds = [true]} : memref<?xf32>, vector<128xf32>
566    %r2 = vector.transfer_read %arg2[%c128], %cst {in_bounds = [true]} : memref<?xf32>, vector<128xf32>
567    %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %r1, %arg5 = %r2)
568    -> (vector<128xf32>, vector<128xf32>) {
569      %o1 = affine.apply #map1()[%arg3]
570      %o2 = affine.apply #map2()[%arg3]
571      %4 = vector.transfer_read %arg1[%o1], %cst {in_bounds = [true]} : memref<?xf32>, vector<128xf32>
572      %5 = vector.transfer_read %arg1[%o2], %cst {in_bounds = [true]} : memref<?xf32>, vector<128xf32>
573      %6 = arith.addf %4, %arg4 : vector<128xf32>
574      %7 = arith.addf %5, %arg5 : vector<128xf32>
575      scf.yield %6, %7 : vector<128xf32>, vector<128xf32>
576    }
577    gpu.yield %def, %3#0, %3#1 :  vector<32xf32>, vector<128xf32>, vector<128xf32>
578  }
579  %1 = affine.apply #map()[%arg0]
580  vector.transfer_write %0#1, %arg2[%1] {in_bounds = [true]} : vector<4xf32>, memref<?xf32>
581  %2 = affine.apply #map2()[%arg0]
582  vector.transfer_write %0#2, %arg2[%2] {in_bounds = [true]} : vector<4xf32>, memref<?xf32>
583  "some_use"(%0#0) : (vector<1xf32>) -> ()
584  return
585}
586
587// -----
588
589// CHECK-PROP-LABEL: func @vector_reduction(
590//  CHECK-PROP-SAME:     %[[laneid:.*]]: index)
591//   CHECK-PROP-DAG:   %[[c1:.*]] = arith.constant 1 : i32
592//   CHECK-PROP-DAG:   %[[c2:.*]] = arith.constant 2 : i32
593//   CHECK-PROP-DAG:   %[[c4:.*]] = arith.constant 4 : i32
594//   CHECK-PROP-DAG:   %[[c8:.*]] = arith.constant 8 : i32
595//   CHECK-PROP-DAG:   %[[c16:.*]] = arith.constant 16 : i32
596//   CHECK-PROP-DAG:   %[[c32:.*]] = arith.constant 32 : i32
597//       CHECK-PROP:   %[[warp_op:.*]] = gpu.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<1xf32>) {
598//       CHECK-PROP:     gpu.yield %{{.*}} : vector<32xf32>
599//       CHECK-PROP:   }
600//       CHECK-PROP:   %[[a:.*]] = vector.extract %[[warp_op]][0] : f32 from vector<1xf32>
601//       CHECK-PROP:   %[[r0:.*]], %{{.*}} = gpu.shuffle  xor %[[a]], %[[c1]], %[[c32]]
602//       CHECK-PROP:   %[[a0:.*]] = arith.addf %[[a]], %[[r0]]
603//       CHECK-PROP:   %[[r1:.*]], %{{.*}} = gpu.shuffle  xor %[[a0]], %[[c2]], %[[c32]]
604//       CHECK-PROP:   %[[a1:.*]] = arith.addf %[[a0]], %[[r1]]
605//       CHECK-PROP:   %[[r2:.*]], %{{.*}} = gpu.shuffle  xor %[[a1]], %[[c4]], %[[c32]]
606//       CHECK-PROP:   %[[a2:.*]] = arith.addf %[[a1]], %[[r2]]
607//       CHECK-PROP:   %[[r3:.*]], %{{.*}} = gpu.shuffle  xor %[[a2]], %[[c8]], %[[c32]]
608//       CHECK-PROP:   %[[a3:.*]] = arith.addf %[[a2]], %[[r3]]
609//       CHECK-PROP:   %[[r4:.*]], %{{.*}} = gpu.shuffle  xor %[[a3]], %[[c16]], %[[c32]]
610//       CHECK-PROP:   %[[a4:.*]] = arith.addf %[[a3]], %[[r4]]
611//       CHECK-PROP:   return %[[a4]] : f32
// An add-reduction of a vector<32xf32> to f32 inside the warp region. The
// expectations above look for the warp op yielding the unreduced vector as a
// vector<1xf32> per-lane result, followed by five gpu.shuffle xor /
// arith.addf pairs using offsets 1, 2, 4, 8 and 16 with width 32.
612func.func @vector_reduction(%laneid: index) -> (f32) {
613  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
614    %0 = "some_def"() : () -> (vector<32xf32>)
615    %1 = vector.reduction <add>, %0 : vector<32xf32> into f32
616    gpu.yield %1 : f32
617  }
618  return %r : f32
619}
620
621// -----
622
// Propagation test: a warp region with no results that holds a 4096-element
// reduction, a broadcast + arith.divf on vector<1xf32>, and an scf.for loop
// doing transfer_read / arith.subf / transfer_write.
// The checks expect one warp op whose body begins with "some_def" immediately
// followed by vector.reduction, then an arith.divf on vector<1xf32>, no second
// warp op, and an scf.for whose arith.subf consumes the divf result.
623// CHECK-PROP-LABEL: func @warp_distribute(
624//  CHECK-PROP-SAME:    %[[ID:[a-zA-Z0-9]+]]
625//  CHECK-PROP-SAME:    %[[SRC:[a-zA-Z0-9]+]]
626//  CHECK-PROP-SAME:    %[[DEST:[a-zA-Z0-9]+]]
627//       CHECK-PROP:    gpu.warp_execute_on_lane_0(%[[ID]])[32]
628//  CHECK-PROP-NEXT:      "some_def"() : () -> vector<4096xf32>
629//  CHECK-PROP-NEXT:      %{{.*}} = vector.reduction
630//       CHECK-PROP:      %[[DEF:.*]] = arith.divf %{{.*}}, %{{.*}} : vector<1xf32>
631//   CHECK-PROP-NOT:      gpu.warp_execute_on_lane_0
632//       CHECK-PROP:      scf.for
633//       CHECK-PROP:        %{{.*}} = arith.subf %{{.*}}, %[[DEF]] : vector<1xf32>
634func.func @warp_distribute(%arg0: index, %src: memref<128xf32>, %dest: memref<128xf32>){
635  %cst = arith.constant 0.000000e+00 : f32
636  %c0 = arith.constant 0 : index
637  %c1 = arith.constant 1 : index
638  %c128 = arith.constant 128 : index
639  %f0 = arith.constant 0.000000e+00 : f32
640  gpu.warp_execute_on_lane_0(%arg0)[32]{
641    %cst_1 = arith.constant dense<2.621440e+05> : vector<1xf32>
642    %0 = "some_def"() : () -> (vector<4096xf32>)
643    %1 = vector.reduction <add>, %0, %cst : vector<4096xf32> into f32
644    %2 = vector.broadcast %1 : f32 to vector<1xf32>
645    %3 = arith.divf %2, %cst_1 : vector<1xf32>
646    scf.for %arg1 = %c0 to %c128 step %c1 {
647        %4 = vector.transfer_read %src[%arg1], %f0 {in_bounds = [true]} : memref<128xf32>, vector<1xf32>
648        %5 = arith.subf %4, %3 : vector<1xf32>
649        vector.transfer_write %5, %dest[%arg1] : vector<1xf32>, memref<128xf32>
650    }
651  }
652  return
653}
654
655// -----
656
// Transfer-write distribution test: the warp region reduces a vector<32xf32>,
// adds a scalar read from a 0-d memref, and writes the sum back through a 0-d
// vector.transfer_write. The checks expect the value to be produced by a warp
// op returning vector<f32>, and the 0-d transfer_write to sit in a second,
// result-less warp op that consumes that result.
657func.func @vector_reduction(%laneid: index, %m0: memref<4x2x32xf32>, %m1: memref<f32>) {
658  %c0 = arith.constant 0: index
659  %f0 = arith.constant 0.0: f32
660  //     CHECK-D: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<f32>) {
661  //     CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] {
662  //     CHECK-D:   vector.transfer_write %[[R]], %{{.*}}[] : vector<f32>, memref<f32>
663  gpu.warp_execute_on_lane_0(%laneid)[32] {
664    %0 = vector.transfer_read %m0[%c0, %c0, %c0], %f0 {in_bounds = [true]} : memref<4x2x32xf32>, vector<32xf32>
665    %1 = vector.transfer_read %m1[], %f0 : memref<f32>, vector<f32>
666    %2 = vector.extractelement %1[] : vector<f32>
667    %3 = vector.reduction <add>, %0 : vector<32xf32> into f32
668    %4 = arith.addf %3, %2 : f32
669    %5 = vector.broadcast %4 : f32 to vector<f32>
670    vector.transfer_write %5, %m1[] : vector<f32>, memref<f32>
671  }
672  return
673}
674
675// -----
676
// Propagation test: same shape as the 32-element reduction test, but the
// reduced vector has 64 elements for 32 lanes. The checks expect the warp op to
// return vector<2xf32> per lane, a per-lane `vector.reduction <add>` on that
// value, and then the five gpu.shuffle xor / arith.addf steps
// (offsets 1, 2, 4, 8, 16; width 32) to combine the lanes.
677// CHECK-PROP-LABEL: func @vector_reduction_large(
678//  CHECK-PROP-SAME:     %[[laneid:.*]]: index)
679//   CHECK-PROP-DAG:   %[[c1:.*]] = arith.constant 1 : i32
680//   CHECK-PROP-DAG:   %[[c2:.*]] = arith.constant 2 : i32
681//   CHECK-PROP-DAG:   %[[c4:.*]] = arith.constant 4 : i32
682//   CHECK-PROP-DAG:   %[[c8:.*]] = arith.constant 8 : i32
683//   CHECK-PROP-DAG:   %[[c16:.*]] = arith.constant 16 : i32
684//   CHECK-PROP-DAG:   %[[c32:.*]] = arith.constant 32 : i32
685//       CHECK-PROP:   %[[warp_op:.*]] = gpu.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<2xf32>) {
686//       CHECK-PROP:     gpu.yield %{{.*}} : vector<64xf32>
687//       CHECK-PROP:   }
688//       CHECK-PROP:   %[[a:.*]] = vector.reduction <add>, %[[warp_op]] : vector<2xf32> into f32
689//       CHECK-PROP:   %[[r0:.*]], %{{.*}} = gpu.shuffle  xor %[[a]], %[[c1]], %[[c32]]
690//       CHECK-PROP:   %[[a0:.*]] = arith.addf %[[a]], %[[r0]]
691//       CHECK-PROP:   %[[r1:.*]], %{{.*}} = gpu.shuffle  xor %[[a0]], %[[c2]], %[[c32]]
692//       CHECK-PROP:   %[[a1:.*]] = arith.addf %[[a0]], %[[r1]]
693//       CHECK-PROP:   %[[r2:.*]], %{{.*}} = gpu.shuffle  xor %[[a1]], %[[c4]], %[[c32]]
694//       CHECK-PROP:   %[[a2:.*]] = arith.addf %[[a1]], %[[r2]]
695//       CHECK-PROP:   %[[r3:.*]], %{{.*}} = gpu.shuffle  xor %[[a2]], %[[c8]], %[[c32]]
696//       CHECK-PROP:   %[[a3:.*]] = arith.addf %[[a2]], %[[r3]]
697//       CHECK-PROP:   %[[r4:.*]], %{{.*}} = gpu.shuffle  xor %[[a3]], %[[c16]], %[[c32]]
698//       CHECK-PROP:   %[[a4:.*]] = arith.addf %[[a3]], %[[r4]]
699//       CHECK-PROP:   return %[[a4]] : f32
700func.func @vector_reduction_large(%laneid: index) -> (f32) {
701  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
702    %0 = "some_def"() : () -> (vector<64xf32>)
703    %1 = vector.reduction <add>, %0 : vector<64xf32> into f32
704    gpu.yield %1 : f32
705  }
706  return %r : f32
707}
708
709// -----
710
// Propagation test: a 64-element `vector.reduction <add>` that also takes a
// scalar accumulator operand. The checks expect the warp op to return both the
// per-lane vector<2xf32> and the f32 accumulator, the per-lane reduction and
// the five shuffle/add steps to run without the accumulator, and the
// accumulator (result #1) to be added once at the very end.
711// CHECK-PROP-LABEL: func @vector_reduction_acc(
712//  CHECK-PROP-SAME:     %[[laneid:.*]]: index)
713//   CHECK-PROP-DAG:   %[[c1:.*]] = arith.constant 1 : i32
714//   CHECK-PROP-DAG:   %[[c2:.*]] = arith.constant 2 : i32
715//   CHECK-PROP-DAG:   %[[c4:.*]] = arith.constant 4 : i32
716//   CHECK-PROP-DAG:   %[[c8:.*]] = arith.constant 8 : i32
717//   CHECK-PROP-DAG:   %[[c16:.*]] = arith.constant 16 : i32
718//   CHECK-PROP-DAG:   %[[c32:.*]] = arith.constant 32 : i32
719//       CHECK-PROP:   %[[warp_op:.*]]:2 = gpu.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<2xf32>, f32) {
720//       CHECK-PROP:     gpu.yield %{{.*}}, %{{.*}} : vector<64xf32>, f32
721//       CHECK-PROP:   }
722//       CHECK-PROP:   %[[a:.*]] = vector.reduction <add>, %[[warp_op]]#0 : vector<2xf32> into f32
723//       CHECK-PROP:   %[[r0:.*]], %{{.*}} = gpu.shuffle  xor %[[a]], %[[c1]], %[[c32]]
724//       CHECK-PROP:   %[[a0:.*]] = arith.addf %[[a]], %[[r0]]
725//       CHECK-PROP:   %[[r1:.*]], %{{.*}} = gpu.shuffle  xor %[[a0]], %[[c2]], %[[c32]]
726//       CHECK-PROP:   %[[a1:.*]] = arith.addf %[[a0]], %[[r1]]
727//       CHECK-PROP:   %[[r2:.*]], %{{.*}} = gpu.shuffle  xor %[[a1]], %[[c4]], %[[c32]]
728//       CHECK-PROP:   %[[a2:.*]] = arith.addf %[[a1]], %[[r2]]
729//       CHECK-PROP:   %[[r3:.*]], %{{.*}} = gpu.shuffle  xor %[[a2]], %[[c8]], %[[c32]]
730//       CHECK-PROP:   %[[a3:.*]] = arith.addf %[[a2]], %[[r3]]
731//       CHECK-PROP:   %[[r4:.*]], %{{.*}} = gpu.shuffle  xor %[[a3]], %[[c16]], %[[c32]]
732//       CHECK-PROP:   %[[a4:.*]] = arith.addf %[[a3]], %[[r4]]
733//       CHECK-PROP:   %[[a5:.*]] = arith.addf %[[a4]], %[[warp_op]]#1
734//       CHECK-PROP:   return %[[a5]] : f32
735func.func @vector_reduction_acc(%laneid: index) -> (f32) {
736  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
737    %0 = "some_def"() : () -> (vector<64xf32>)
738    %1 = "some_def"() : () -> (f32)
739    %2 = vector.reduction <add>, %0, %1 : vector<64xf32> into f32
740    gpu.yield %2 : f32
741  }
742  return %r : f32
743}
744
745// -----
746
// Propagation test: two arith.addf ops inside the warp region share the
// operand %2, and both sums are yielded. The checks expect the warp op to still
// have two vector<1xf32> results, no arith.addf between the warp op header and
// its gpu.yield, and a yield of two vector<32xf32> values.
747// CHECK-PROP-LABEL:   func @warp_duplicate_yield(
748func.func @warp_duplicate_yield(%laneid: index) -> (vector<1xf32>, vector<1xf32>) {
749  //   CHECK-PROP: %{{.*}}:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xf32>)
750  %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>, vector<1xf32>) {
751    %2 = "some_def"() : () -> (vector<32xf32>)
752    %3 = "some_def"() : () -> (vector<32xf32>)
753    %4 = arith.addf %2, %3 : vector<32xf32>
754    %5 = arith.addf %2, %2 : vector<32xf32>
755// CHECK-PROP-NOT:   arith.addf
756//     CHECK-PROP:   gpu.yield %{{.*}}, %{{.*}} : vector<32xf32>, vector<32xf32>
757    gpu.yield %4, %5 : vector<32xf32>, vector<32xf32>
758  }
759  return %r#0, %r#1 : vector<1xf32>, vector<1xf32>
760}
761
762// -----
763
// Propagation test: the warp region only yields a splat constant
// (dense<2.0> : vector<32xf32>). The checks expect the function to return a
// dense<2.0> constant of the per-lane type vector<1xf32> directly.
764// CHECK-PROP-LABEL: func @warp_constant(
765//       CHECK-PROP:   %[[C:.*]] = arith.constant dense<2.000000e+00> : vector<1xf32>
766//       CHECK-PROP:   return %[[C]] : vector<1xf32>
767func.func @warp_constant(%laneid: index) -> (vector<1xf32>) {
768  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
769    %cst = arith.constant dense<2.0> : vector<32xf32>
770    gpu.yield %cst : vector<32xf32>
771  }
772  return %r : vector<1xf32>
773}
774
775// -----
776
777// TODO: We could use warp shuffles instead of broadcasting the entire vector.
// NOTE(review): the checks below already expect a `gpu.shuffle idx`, so the
// TODO above may be stale -- confirm before relying on it.
778
// Propagation test: a scalar `vector.extract` at static index 9 from a
// vector<64xf32> defined inside the warp region. The checks expect the warp op
// to return the per-lane vector<2xf32>, an extract at in-lane position 1, and a
// `gpu.shuffle idx` reading from lane 5.
// NOTE(review): with 2 elements per lane, 9 floordiv 2 would give lane 4; the
// expected lane 5 matches a ceildiv -- confirm this lane computation is intended.
779// CHECK-PROP-LABEL: func.func @vector_extract_1d(
780//   CHECK-PROP-DAG:   %[[C5_I32:.*]] = arith.constant 5 : i32
781//   CHECK-PROP-DAG:   %[[C1:.*]] = arith.constant 1 : index
782//       CHECK-PROP:   %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<2xf32>) {
783//       CHECK-PROP:     %[[V:.*]] = "some_def"() : () -> vector<64xf32>
784//       CHECK-PROP:     gpu.yield %[[V]] : vector<64xf32>
785//       CHECK-PROP:   }
786//       CHECK-PROP:   %[[E:.*]] = vector.extract %[[R]][%[[C1]]] : f32 from vector<2xf32>
787//       CHECK-PROP:   %[[SHUFFLED:.*]], %{{.*}} = gpu.shuffle  idx %[[E]], %[[C5_I32]]
788//       CHECK-PROP:   return %[[SHUFFLED]] : f32
789func.func @vector_extract_1d(%laneid: index) -> (f32) {
790  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
791    %0 = "some_def"() : () -> (vector<64xf32>)
792    %1 = vector.extract %0[9] : f32 from vector<64xf32>
793    gpu.yield %1 : f32
794  }
795  return %r : f32
796}
797
798// -----
799
// Propagation test: `vector.extract %0[2]` of a row from vector<5x96xf32>,
// yielded as vector<96xf32> and received as vector<3xf32> per lane. The checks
// expect the warp op to yield the whole 5x96 vector (vector<5x3xf32> per lane)
// and the row extract to happen outside the region on the per-lane value.
800// CHECK-PROP-LABEL: func.func @vector_extract_2d(
801//       CHECK-PROP:   %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x3xf32>) {
802//       CHECK-PROP:     %[[V:.*]] = "some_def"
803//       CHECK-PROP:     gpu.yield %[[V]] : vector<5x96xf32>
804//       CHECK-PROP:   }
805//       CHECK-PROP:   %[[E:.*]] = vector.extract %[[W]][2] : vector<3xf32> from vector<5x3xf32>
806//       CHECK-PROP:   return %[[E]]
807func.func @vector_extract_2d(%laneid: index) -> (vector<3xf32>) {
808  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) {
809    %0 = "some_def"() : () -> (vector<5x96xf32>)
810    %1 = vector.extract %0[1, 2] : f32 from vector<5x96xf32>
811    gpu.yield %1 : vector<96xf32>
812  }
813  return %r : vector<3xf32>
814}
815
816// -----
817
// Propagation test: a scalar `vector.extract %0[1, 2]` from vector<5x96xf32>
// yielded as f32. The checks expect the warp op to return the full
// vector<5x96xf32> (result type equal to the yielded type) and the scalar
// extract to be performed outside the region.
818// CHECK-PROP-LABEL: func.func @vector_extract_2d_broadcast_scalar(
819//       CHECK-PROP:   %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x96xf32>) {
820//       CHECK-PROP:     %[[V:.*]] = "some_def"
821//       CHECK-PROP:     gpu.yield %[[V]] : vector<5x96xf32>
822//       CHECK-PROP:   }
823//       CHECK-PROP:   %[[E:.*]] = vector.extract %[[W]][1, 2] : f32 from vector<5x96xf32>
824//       CHECK-PROP:   return %[[E]]
825func.func @vector_extract_2d_broadcast_scalar(%laneid: index) -> (f32) {
826  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
827    %0 = "some_def"() : () -> (vector<5x96xf32>)
828    %1 = vector.extract %0[1, 2] : f32 from vector<5x96xf32>
829    gpu.yield %1 : f32
830  }
831  return %r : f32
832}
833
834// -----
835
// Propagation test: a row extract from vector<5x96xf32> where the warp op
// result type (vector<96xf32>) equals the yielded type. The checks expect the
// warp op to return the full vector<5x96xf32> and the row extract to be
// performed outside the region on that undistributed value.
836// CHECK-PROP-LABEL: func.func @vector_extract_2d_broadcast(
837//       CHECK-PROP:   %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x96xf32>) {
838//       CHECK-PROP:     %[[V:.*]] = "some_def"
839//       CHECK-PROP:     gpu.yield %[[V]] : vector<5x96xf32>
840//       CHECK-PROP:   }
841//       CHECK-PROP:   %[[E:.*]] = vector.extract %[[W]][2] : vector<96xf32> from vector<5x96xf32>
842//       CHECK-PROP:   return %[[E]]
843func.func @vector_extract_2d_broadcast(%laneid: index) -> (vector<96xf32>) {
844  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<96xf32>) {
845    %0 = "some_def"() : () -> (vector<5x96xf32>)
846    %1 = vector.extract %0[2] : vector<96xf32> from vector<5x96xf32>
847    gpu.yield %1 : vector<96xf32>
848  }
849  return %r : vector<96xf32>
850}
851
852// -----
853
// Propagation test: `vector.extract %0[2]` of a 2-D slice from
// vector<8x128x96xf32>, yielded as vector<128x96xf32> and received as
// vector<4x96xf32> per lane. The checks expect the warp op to yield the whole
// 3-D vector (vector<8x4x96xf32> per lane) and the slice extract to happen
// outside the region.
854// CHECK-PROP-LABEL: func.func @vector_extract_3d(
855//       CHECK-PROP:   %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<8x4x96xf32>) {
856//       CHECK-PROP:     %[[V:.*]] = "some_def"
857//       CHECK-PROP:     gpu.yield %[[V]] : vector<8x128x96xf32>
858//       CHECK-PROP:   }
859//       CHECK-PROP:   %[[E:.*]] = vector.extract %[[W]][2] : vector<4x96xf32> from vector<8x4x96xf32>
860//       CHECK-PROP:   return %[[E]]
861func.func @vector_extract_3d(%laneid: index) -> (vector<4x96xf32>) {
862  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) {
863    %0 = "some_def"() : () -> (vector<8x128x96xf32>)
864    %1 = vector.extract %0[2] : vector<128x96xf32> from vector<8x128x96xf32>
865    gpu.yield %1 : vector<128x96xf32>
866  }
867  return %r : vector<4x96xf32>
868}
869
870// -----
871
// Propagation test: `vector.extractelement` on a 0-d vector<f32> inside the
// warp region. The checks expect the warp op to return the vector<f32> itself
// and the scalar to be obtained outside via `vector.extract ...[]`.
872// CHECK-PROP-LABEL: func.func @vector_extractelement_0d(
873//       CHECK-PROP:   %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<f32>) {
874//       CHECK-PROP:     %[[V:.*]] = "some_def"() : () -> vector<f32>
875//       CHECK-PROP:     gpu.yield %[[V]] : vector<f32>
876//       CHECK-PROP:   }
877//       CHECK-PROP:   %[[E:.*]] = vector.extract %[[R]][] : f32 from vector<f32>
878//       CHECK-PROP:   return %[[E]] : f32
879func.func @vector_extractelement_0d(%laneid: index) -> (f32) {
880  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
881    %0 = "some_def"() : () -> (vector<f32>)
882    %1 = vector.extractelement %0[] : vector<f32>
883    gpu.yield %1 : f32
884  }
885  return %r : f32
886}
887
888// -----
889
// Propagation test: `vector.extractelement` at constant index 0 from a
// single-element vector<1xf32>. The checks expect the warp op to return the
// vector<1xf32> unchanged and the element to be taken outside with
// `vector.extract ...[0]`, with no shuffle in between.
890// CHECK-PROP-LABEL: func.func @vector_extractelement_1element(
891//       CHECK-PROP:   %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) {
892//       CHECK-PROP:     %[[V:.*]] = "some_def"() : () -> vector<1xf32>
893//       CHECK-PROP:     gpu.yield %[[V]] : vector<1xf32>
894//       CHECK-PROP:   }
895//       CHECK-PROP:   %[[E:.*]] = vector.extract %[[R]][0] : f32 from vector<1xf32>
896//       CHECK-PROP:   return %[[E]] : f32
897func.func @vector_extractelement_1element(%laneid: index) -> (f32) {
898  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
899    %0 = "some_def"() : () -> (vector<1xf32>)
900    %c0 = arith.constant 0 : index
901    %1 = vector.extractelement %0[%c0 : index] : vector<1xf32>
902    gpu.yield %1 : f32
903  }
904  return %r : f32
905}
906
907// -----
908
// Propagation test: `vector.extractelement` at a dynamic position %pos from a
// vector<96xf32> (vector<3xf32> per lane). The checks expect two affine.apply
// ops on %pos -- `s0 ceildiv 3` for the source lane and `s0 mod 3` for the
// in-lane position -- followed by a per-lane vector.extract and a
// `gpu.shuffle idx` (width 32) from the computed lane.
909//       CHECK-PROP: #[[$map:.*]] = affine_map<()[s0] -> (s0 ceildiv 3)>
910//       CHECK-PROP: #[[$map1:.*]] = affine_map<()[s0] -> (s0 mod 3)>
911// CHECK-PROP-LABEL: func.func @vector_extractelement_1d(
912//  CHECK-PROP-SAME:     %[[LANEID:.*]]: index, %[[POS:.*]]: index
913//   CHECK-PROP-DAG:   %[[C32:.*]] = arith.constant 32 : i32
914//       CHECK-PROP:   %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<3xf32>) {
915//       CHECK-PROP:     %[[V:.*]] = "some_def"
916//       CHECK-PROP:     gpu.yield %[[V]] : vector<96xf32>
917//       CHECK-PROP:   }
918//       CHECK-PROP:   %[[FROM_LANE:.*]] = affine.apply #[[$map]]()[%[[POS]]]
919//       CHECK-PROP:   %[[DISTR_POS:.*]] = affine.apply #[[$map1]]()[%[[POS]]]
920//       CHECK-PROP:   %[[EXTRACTED:.*]] = vector.extract %[[W]][%[[DISTR_POS]]] : f32 from vector<3xf32>
921//       CHECK-PROP:   %[[FROM_LANE_I32:.*]] = arith.index_cast %[[FROM_LANE]] : index to i32
922//       CHECK-PROP:   %[[SHUFFLED:.*]], %{{.*}} = gpu.shuffle  idx %[[EXTRACTED]], %[[FROM_LANE_I32]], %[[C32]] : f32
923//       CHECK-PROP:   return %[[SHUFFLED]]
924func.func @vector_extractelement_1d(%laneid: index, %pos: index) -> (f32) {
925  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
926    %0 = "some_def"() : () -> (vector<96xf32>)
927    %1 = vector.extractelement %0[%pos : index] : vector<96xf32>
928    gpu.yield %1 : f32
929  }
930  return %r : f32
931}
932
933// -----
934
935// Index-typed values cannot be shuffled at the moment.
936
// Negative propagation test: same as the dynamic-position extractelement test
// but with element type `index`. The checks expect the extract to remain
// inside the warp region and the region to keep yielding a scalar `index`.
937// CHECK-PROP-LABEL: func.func @vector_extractelement_1d_index(
938//       CHECK-PROP:   gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (index) {
939//       CHECK-PROP:     "some_def"
940//       CHECK-PROP:     vector.extract
941//       CHECK-PROP:     gpu.yield {{.*}} : index
942//       CHECK-PROP:   }
943func.func @vector_extractelement_1d_index(%laneid: index, %pos: index) -> (index) {
944  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (index) {
945    %0 = "some_def"() : () -> (vector<96xindex>)
946    %1 = vector.extractelement %0[%pos : index] : vector<96xindex>
947    gpu.yield %1 : index
948  }
949  return %r : index
950}
951
952// -----
953
// Propagation test: a 2-D transfer_read yielded from the warp region whose
// result is written outside the region at an index that uses the lane id.
// The checks expect no warp op to remain, a per-lane transfer_read of
// vector<1x1xf32> from the source memref at [0, laneid], and a transfer_write
// of that value.
// The source memref is captured from the signature as SRC so the checks do not
// depend on the printer's auto-generated argument name.
954// CHECK-PROP:   func @lane_dependent_warp_propagate_read
955//  CHECK-PROP-SAME:   %[[ID:[a-zA-Z0-9]+]]: index, %[[SRC:[a-zA-Z0-9]+]]: memref<1x1024xf32>
956func.func @lane_dependent_warp_propagate_read(
957    %laneid: index, %src: memref<1x1024xf32>, %dest: memref<1x1024xf32>) {
958  // CHECK-PROP-DAG: %[[C0:.*]] = arith.constant 0 : index
959  // CHECK-PROP-NOT: gpu.warp_execute_on_lane_0
960  // CHECK-PROP-DAG: %[[R0:.*]] = vector.transfer_read %[[SRC]][%[[C0]], %[[ID]]], %{{.*}} : memref<1x1024xf32>, vector<1x1xf32>
961  // CHECK-PROP: vector.transfer_write %[[R0]], {{.*}} : vector<1x1xf32>, memref<1x1024xf32>
962  %c0 = arith.constant 0 : index
963  %cst = arith.constant 0.000000e+00 : f32
964  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x1xf32>) {
965    %2 = vector.transfer_read %src[%c0, %c0], %cst : memref<1x1024xf32>, vector<1x32xf32>
966    gpu.yield %2 : vector<1x32xf32>
967  }
968  vector.transfer_write %r, %dest[%c0, %laneid] : vector<1x1xf32>, memref<1x1024xf32>
969  return
970}
971
972// -----
973
// Propagation test: a 3-D transfer_read of vector<32x4x32xf32> yielded from a
// 1024-lane warp region as vector<1x1x4xf32> per lane. The checks (placed
// after the function) expect the read to be hoisted out of the region with its
// three indices computed from the lane id by the affine maps captured below.
974func.func @warp_propagate_read_3d(%laneid: index, %src: memref<32x4x32xf32>) -> vector<1x1x4xf32> {
975  %c0 = arith.constant 0 : index
976  %cst = arith.constant 0.000000e+00 : f32
977  %r = gpu.warp_execute_on_lane_0(%laneid)[1024] -> (vector<1x1x4xf32>) {
978    %2 = vector.transfer_read %src[%c0, %c0, %c0], %cst : memref<32x4x32xf32>, vector<32x4x32xf32>
979    gpu.yield %2 : vector<32x4x32xf32>
980  }
981  return %r : vector<1x1x4xf32>
982}
983
984//   CHECK-PROP-DAG: #[[$ID0MAP:.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>
985//   CHECK-PROP-DAG: #[[$ID1MAP:.+]] = affine_map<()[s0] -> ((s0 floordiv 8) mod 4)>
986//   CHECK-PROP-DAG: #[[$ID2MAP:.+]] = affine_map<()[s0] -> ((s0 floordiv 8) floordiv 32)>
987// CHECK-PROP-LABEL: func.func @warp_propagate_read_3d
988//  CHECK-PROP-SAME: (%[[LANE:.+]]: index, %[[SRC:.+]]: memref<32x4x32xf32>)
989//   CHECK-PROP-DAG: %[[ID0:.+]] = affine.apply #[[$ID0MAP]]()[%[[LANE]]]
990//   CHECK-PROP-DAG: %[[ID1:.+]] = affine.apply #[[$ID1MAP]]()[%[[LANE]]]
991//   CHECK-PROP-DAG: %[[ID2:.+]] = affine.apply #[[$ID2MAP]]()[%[[LANE]]]
992//       CHECK-PROP: %[[READ:.+]] = vector.transfer_read %[[SRC]][%[[ID2]], %[[ID1]], %[[ID0]]], %{{.+}} : memref<32x4x32xf32>, vector<1x1x4xf32>
993//       CHECK-PROP: return %[[READ]] : vector<1x1x4xf32>
994
995// -----
996
// Propagation test: a transfer_read with the broadcasting permutation map
// (d0, d1) -> (d0, 0), yielded from a 512-lane warp region as vector<1x4xf32>
// per lane. The checks (placed after the function) expect the read to be
// hoisted out with the same permutation map, the first index computed as
// `laneid floordiv 16`, and the second index left at constant 0.
997func.func @warp_propagate_read_broadcast(%laneid: index, %src: memref<32x1xf32>) -> vector<1x4xf32> {
998  %c0 = arith.constant 0 : index
999  %cst = arith.constant 0.000000e+00 : f32
1000  %r = gpu.warp_execute_on_lane_0(%laneid)[512] -> (vector<1x4xf32>) {
1001    %2 = vector.transfer_read %src[%c0, %c0], %cst {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (d0, 0)>} : memref<32x1xf32>, vector<32x64xf32>
1002    gpu.yield %2 : vector<32x64xf32>
1003  }
1004  return %r : vector<1x4xf32>
1005}
1006
1007//   CHECK-PROP-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 floordiv 16)>
1008//   CHECK-PROP-DAG: #[[$READMAP:.+]] = affine_map<(d0, d1) -> (d0, 0)>
1009// CHECK-PROP-LABEL: func.func @warp_propagate_read_broadcast
1010//  CHECK-PROP-SAME: (%[[LANE:.+]]: index, %[[SRC:.+]]: memref<32x1xf32>)
1011//       CHECK-PROP:  %[[C0:.+]] = arith.constant 0 : index
1012//       CHECK-PROP:  %[[ID:.+]] = affine.apply #[[$MAP]]()[%[[LANE]]]
1013//       CHECK-PROP:  %[[READ:.+]] = vector.transfer_read %[[SRC]][%[[ID]], %[[C0]]], %{{.+}} {in_bounds = [true, true], permutation_map = #[[$READMAP]]} : memref<32x1xf32>, vector<1x4xf32>
1014//       CHECK-PROP:  return %[[READ]] : vector<1x4xf32>
1015
1016// -----
1017
// Negative propagation test: the transfer_read result is yielded and also
// consumed by "blocking_use" inside the warp region. The checks expect the
// region body to stay exactly transfer_read, "blocking_use", gpu.yield (on
// consecutive lines), i.e. the read is neither moved out nor duplicated.
1018// CHECK-PROP:   func @dont_duplicate_read
1019func.func @dont_duplicate_read(
1020  %laneid: index, %src: memref<1024xf32>) -> vector<1xf32> {
1021  %c0 = arith.constant 0 : index
1022  %cst = arith.constant 0.000000e+00 : f32
1023//       CHECK-PROP:   gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) {
1024//  CHECK-PROP-NEXT:     vector.transfer_read
1025//  CHECK-PROP-NEXT:     "blocking_use"
1026//  CHECK-PROP-NEXT:     gpu.yield
1027  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
1028    %2 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>, vector<32xf32>
1029    "blocking_use"(%2) : (vector<32xf32>) -> ()
1030    gpu.yield %2 : vector<32xf32>
1031  }
1032  return %r : vector<1xf32>
1033}
1034
1035// -----
1036
// Propagation test: the warp region yields the same value twice. The checks
// expect the warp op to be reduced to a single vector<1xf32> result, the yield
// to carry that value once, and the function to return the single result for
// both of its return values.
1037// CHECK-PROP:   func @dedup
1038func.func @dedup(%laneid: index, %v0: vector<4xf32>, %v1: vector<4xf32>)
1039    -> (vector<1xf32>, vector<1xf32>) {
1040
1041  // CHECK-PROP: %[[SINGLE_RES:.*]] = gpu.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>) {
1042  %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32]
1043      args(%v0, %v1 : vector<4xf32>, vector<4xf32>) -> (vector<1xf32>, vector<1xf32>) {
1044    ^bb0(%arg0: vector<128xf32>, %arg1: vector<128xf32>):
1045
1046    // CHECK-PROP: %[[SINGLE_VAL:.*]] = "some_def"(%{{.*}}) : (vector<128xf32>) -> vector<32xf32>
1047    %2 = "some_def"(%arg0) : (vector<128xf32>) -> vector<32xf32>
1048
1049    // CHECK-PROP: gpu.yield %[[SINGLE_VAL]] : vector<32xf32>
1050    gpu.yield %2, %2 : vector<32xf32>, vector<32xf32>
1051  }
1052
1053  // CHECK-PROP: return %[[SINGLE_RES]], %[[SINGLE_RES]] : vector<1xf32>, vector<1xf32>
1054  return %r#0, %r#1 : vector<1xf32>, vector<1xf32>
1055}
1056
1057// -----
1058
// scf.if-lowering test: every operand and result of the warp op has the same
// type inside and outside the region (f32, vector<f32>, vector<1xf32>,
// vector<1x1xf32>), so nothing is split across lanes.
// The checks expect, inside the scf.if, each value to be read at index 0 from
// a memory-space-3 memref (memref.load for the scalar, vector.transfer_read
// for the vectors), passed through its "some_def_*" op, and stored back at
// index 0; after the scf.if they expect a gpu.barrier and the four results to
// be re-read from those buffers, with no gpu.yield left over.
1059// CHECK-SCF-IF:   func @warp_execute_has_broadcast_semantics
1060func.func @warp_execute_has_broadcast_semantics(%laneid: index, %s0: f32, %v0: vector<f32>, %v1: vector<1xf32>, %v2: vector<1x1xf32>)
1061    -> (f32, vector<f32>, vector<1xf32>, vector<1x1xf32>) {
1062  // CHECK-SCF-IF-DAG: %[[C0:.*]] = arith.constant 0 : index
1063
1064  // CHECK-SCF-IF: scf.if{{.*}}{
1065  %r:4 = gpu.warp_execute_on_lane_0(%laneid)[32]
1066      args(%s0, %v0, %v1, %v2 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>) -> (f32, vector<f32>, vector<1xf32>, vector<1x1xf32>) {
1067    ^bb0(%bs0: f32, %bv0: vector<f32>, %bv1: vector<1xf32>, %bv2: vector<1x1xf32>):
1068
1069      // CHECK-SCF-IF: vector.transfer_read {{.*}}[%[[C0]], %[[C0]]]{{.*}} {in_bounds = [true, true]} : memref<1x1xf32, 3>, vector<1x1xf32>
1070      // CHECK-SCF-IF: vector.transfer_read {{.*}}[%[[C0]]]{{.*}} {in_bounds = [true]} : memref<1xf32, 3>, vector<1xf32>
1071      // CHECK-SCF-IF: vector.transfer_read {{.*}}[]{{.*}} : memref<f32, 3>, vector<f32>
1072      // CHECK-SCF-IF: memref.load {{.*}}[%[[C0]]] : memref<1xf32, 3>
1073      // CHECK-SCF-IF: "some_def_0"(%{{.*}}) : (f32) -> f32
1074      // CHECK-SCF-IF: "some_def_1"(%{{.*}}) : (vector<f32>) -> vector<f32>
1075      // CHECK-SCF-IF: "some_def_1"(%{{.*}}) : (vector<1xf32>) -> vector<1xf32>
1076      // CHECK-SCF-IF: "some_def_1"(%{{.*}}) : (vector<1x1xf32>) -> vector<1x1xf32>
1077      // CHECK-SCF-IF: memref.store {{.*}}[%[[C0]]] : memref<1xf32, 3>
1078      // CHECK-SCF-IF: vector.transfer_write {{.*}}[] : vector<f32>, memref<f32, 3>
1079      // CHECK-SCF-IF: vector.transfer_write {{.*}}[%[[C0]]] {in_bounds = [true]} : vector<1xf32>, memref<1xf32, 3>
1080      // CHECK-SCF-IF: vector.transfer_write {{.*}}[%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<1x1xf32>, memref<1x1xf32, 3>
1081
1082      %rs0 = "some_def_0"(%bs0) : (f32) -> f32
1083      %rv0 = "some_def_1"(%bv0) : (vector<f32>) -> vector<f32>
1084      %rv1 = "some_def_1"(%bv1) : (vector<1xf32>) -> vector<1xf32>
1085      %rv2 = "some_def_1"(%bv2) : (vector<1x1xf32>) -> vector<1x1xf32>
1086
1087      // CHECK-SCF-IF-NOT: gpu.yield
1088      gpu.yield %rs0, %rv0, %rv1, %rv2 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>
1089  }
1090
1091  // CHECK-SCF-IF: gpu.barrier
1092  // CHECK-SCF-IF: %[[RV2:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]]{{.*}} {in_bounds = [true, true]} : memref<1x1xf32, 3>, vector<1x1xf32>
1093  // CHECK-SCF-IF: %[[RV1:.*]] = vector.transfer_read {{.*}}[%[[C0]]]{{.*}} {in_bounds = [true]} : memref<1xf32, 3>, vector<1xf32>
1094  // CHECK-SCF-IF: %[[RV0:.*]] = vector.transfer_read {{.*}}[]{{.*}} : memref<f32, 3>, vector<f32>
1095  // CHECK-SCF-IF: %[[RS0:.*]] = memref.load {{.*}}[%[[C0]]] : memref<1xf32, 3>
1096  // CHECK-SCF-IF: return %[[RS0]], %[[RV0]], %[[RV1]], %[[RV2]] : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>
1097  return %r#0, %r#1, %r#2, %r#3 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>
1098}
1099
1100// -----
1101
// scf.if-lowering test with n-D operands/results: %v0 is split along dim 0
// (vector<1x64x1xf32> per lane, vector<32x64x1xf32> in the region) and %v1
// along dim 1 (vector<1x2x128xf32> per lane, vector<1x64x128xf32> in the
// region).
// The checks expect each lane to write its slice into a memory-space-3 buffer
// (offset `laneid` along dim 0, resp. `laneid * 2` along dim 1), a gpu.barrier,
// the scf.if body to read/compute/write the full-size vectors at index 0, a
// second gpu.barrier, and each lane to read its result slice back at the same
// offsets.
// All index operands and padding values are matched through captures or
// wildcards so the checks do not depend on printer-generated SSA names.
1102// CHECK-SCF-IF-DAG: #[[$TIMES2:.*]] = affine_map<()[s0] -> (s0 * 2)>
1103
1104// CHECK-SCF-IF:   func @warp_execute_nd_distribute
1105// CHECK-SCF-IF-SAME: (%[[LANEID:.*]]: index
1106func.func @warp_execute_nd_distribute(%laneid: index, %v0: vector<1x64x1xf32>, %v1: vector<1x2x128xf32>)
1107    -> (vector<1x64x1xf32>, vector<1x2x128xf32>) {
1108  // CHECK-SCF-IF-DAG: %[[C0:.*]] = arith.constant 0 : index
1109
1110  // CHECK-SCF-IF:  vector.transfer_write %{{.*}}, %{{.*}}[%[[LANEID]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x64x1xf32>, memref<32x64x1xf32, 3>
1111  // CHECK-SCF-IF:  %[[RID:.*]] = affine.apply #[[$TIMES2]]()[%[[LANEID]]]
1112  // CHECK-SCF-IF:  vector.transfer_write %{{.*}}, %{{.*}}[%[[C0]], %[[RID]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x2x128xf32>, memref<1x64x128xf32, 3>
1113  // CHECK-SCF-IF:  gpu.barrier
1114
1115  // CHECK-SCF-IF: scf.if{{.*}}{
1116  %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32]
1117      args(%v0, %v1 : vector<1x64x1xf32>, vector<1x2x128xf32>) -> (vector<1x64x1xf32>, vector<1x2x128xf32>) {
1118    ^bb0(%arg0: vector<32x64x1xf32>, %arg1: vector<1x64x128xf32>):
1119
1120  // CHECK-SCF-IF-DAG: %[[SR0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<32x64x1xf32, 3>, vector<32x64x1xf32>
1121  // CHECK-SCF-IF-DAG: %[[SR1:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<1x64x128xf32, 3>, vector<1x64x128xf32>
1122  //     CHECK-SCF-IF: %[[W0:.*]] = "some_def_0"(%[[SR0]]) : (vector<32x64x1xf32>) -> vector<32x64x1xf32>
1123  //     CHECK-SCF-IF: %[[W1:.*]] = "some_def_1"(%[[SR1]]) : (vector<1x64x128xf32>) -> vector<1x64x128xf32>
1124  // CHECK-SCF-IF-DAG: vector.transfer_write %[[W0]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<32x64x1xf32>, memref<32x64x1xf32, 3>
1125  // CHECK-SCF-IF-DAG: vector.transfer_write %[[W1]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x64x128xf32>, memref<1x64x128xf32, 3>
1126
1127      %r0 = "some_def_0"(%arg0) : (vector<32x64x1xf32>) -> vector<32x64x1xf32>
1128      %r1 = "some_def_1"(%arg1) : (vector<1x64x128xf32>) -> vector<1x64x128xf32>
1129
1130      // CHECK-SCF-IF-NOT: gpu.yield
1131      gpu.yield %r0, %r1 : vector<32x64x1xf32>, vector<1x64x128xf32>
1132  }
1133
1134  //     CHECK-SCF-IF: gpu.barrier
1135  //     CHECK-SCF-IF: %[[WID:.*]] = affine.apply #[[$TIMES2]]()[%[[LANEID]]]
1136  // CHECK-SCF-IF-DAG: %[[R0:.*]] = vector.transfer_read %{{.*}}[%[[LANEID]], %[[C0]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<32x64x1xf32, 3>, vector<1x64x1xf32>
1137  // CHECK-SCF-IF-DAG: %[[R1:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[WID]], %[[C0]]], %{{.*}} {in_bounds = [true, true, true]} : memref<1x64x128xf32, 3>, vector<1x2x128xf32>
1138  //     CHECK-SCF-IF: return %[[R0]], %[[R1]] : vector<1x64x1xf32>, vector<1x2x128xf32>
1139  return %r#0, %r#1 : vector<1x64x1xf32>, vector<1x2x128xf32>
1140}
1141
1142// -----
1143
// Propagation test: `vector.insertelement` at a dynamic position %pos into a
// vector<96xf32> (vector<3xf32> per lane). The checks expect the warp op to
// return both the per-lane vector and the scalar, two affine.apply ops on %pos
// (`s0 ceildiv 3` for the inserting lane, `s0 mod 3` for the in-lane
// position), and an scf.if on `laneid == inserting lane` that performs the
// insert in the then-branch and forwards the untouched vector otherwise.
1144//       CHECK-PROP:   #[[$MAP:.*]] = affine_map<()[s0] -> (s0 ceildiv 3)>
1145//       CHECK-PROP:   #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 mod 3)>
1146// CHECK-PROP-LABEL: func @vector_insertelement_1d(
1147//  CHECK-PROP-SAME:     %[[LANEID:.*]]: index, %[[POS:.*]]: index
1148//       CHECK-PROP:   %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, f32)
1149//       CHECK-PROP:   %[[INSERTING_LANE:.*]] = affine.apply #[[$MAP]]()[%[[POS]]]
1150//       CHECK-PROP:   %[[INSERTING_POS:.*]] = affine.apply #[[$MAP1]]()[%[[POS]]]
1151//       CHECK-PROP:   %[[SHOULD_INSERT:.*]] = arith.cmpi eq, %[[LANEID]], %[[INSERTING_LANE]] : index
1152//       CHECK-PROP:   %[[R:.*]] = scf.if %[[SHOULD_INSERT]] -> (vector<3xf32>) {
1153//       CHECK-PROP:     %[[INSERT:.*]] = vector.insert %[[W]]#1, %[[W]]#0 [%[[INSERTING_POS]]]
1154//       CHECK-PROP:     scf.yield %[[INSERT]]
1155//       CHECK-PROP:   } else {
1156//       CHECK-PROP:     scf.yield %[[W]]#0
1157//       CHECK-PROP:   }
1158//       CHECK-PROP:   return %[[R]]
1159func.func @vector_insertelement_1d(%laneid: index, %pos: index) -> (vector<3xf32>) {
1160  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) {
1161    %0 = "some_def"() : () -> (vector<96xf32>)
1162    %f = "another_def"() : () -> (f32)
1163    %1 = vector.insertelement %f, %0[%pos : index] : vector<96xf32>
1164    gpu.yield %1 : vector<96xf32>
1165  }
1166  return %r : vector<3xf32>
1167}
1168
1169// -----
1170
// Propagation test: `vector.insertelement` at a dynamic position where the
// warp op result type (vector<96xf32>) equals the yielded type. The checks
// expect the warp op to return the full vector and the scalar, and a single
// `vector.insert` at %pos outside the region with no lane-dependent scf.if
// check in between.
1171// CHECK-PROP-LABEL: func @vector_insertelement_1d_broadcast(
1172//  CHECK-PROP-SAME:     %[[LANEID:.*]]: index, %[[POS:.*]]: index
1173//       CHECK-PROP:   %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, f32)
1174//       CHECK-PROP:     %[[VEC:.*]] = "some_def"
1175//       CHECK-PROP:     %[[VAL:.*]] = "another_def"
1176//       CHECK-PROP:     gpu.yield %[[VEC]], %[[VAL]]
1177//       CHECK-PROP:   vector.insert %[[W]]#1, %[[W]]#0 [%[[POS]]] : f32 into vector<96xf32>
1178func.func @vector_insertelement_1d_broadcast(%laneid: index, %pos: index) -> (vector<96xf32>) {
1179  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<96xf32>) {
1180    %0 = "some_def"() : () -> (vector<96xf32>)
1181    %f = "another_def"() : () -> (f32)
1182    %1 = vector.insertelement %f, %0[%pos : index] : vector<96xf32>
1183    gpu.yield %1 : vector<96xf32>
1184  }
1185  return %r : vector<96xf32>
1186}
1187
1188// -----
1189
// Propagation test: `vector.insertelement` into a 0-d vector<f32>. The checks
// expect the warp op to return the vector<f32> and the scalar, and the insert
// to be performed outside the region as `vector.insert ... []`.
1190// CHECK-PROP-LABEL: func @vector_insertelement_0d(
1191//       CHECK-PROP:   %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<f32>, f32)
1192//       CHECK-PROP:     %[[VEC:.*]] = "some_def"
1193//       CHECK-PROP:     %[[VAL:.*]] = "another_def"
1194//       CHECK-PROP:     gpu.yield %[[VEC]], %[[VAL]]
1195//       CHECK-PROP:   vector.insert %[[W]]#1, %[[W]]#0 [] : f32 into vector<f32>
1196func.func @vector_insertelement_0d(%laneid: index) -> (vector<f32>) {
1197  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<f32>) {
1198    %0 = "some_def"() : () -> (vector<f32>)
1199    %f = "another_def"() : () -> (f32)
1200    %1 = vector.insertelement %f, %0[] : vector<f32>
1201    gpu.yield %1 : vector<f32>
1202  }
1203  return %r : vector<f32>
1204}
1205
1206// -----
1207
// Propagation test: a scalar `vector.insert` at static index 76 into a
// vector<96xf32> (vector<3xf32> per lane). The checks expect the warp op to
// return the per-lane vector and the scalar, and an scf.if on `laneid == 26`
// that inserts at in-lane position 1 in the then-branch and forwards the
// untouched vector otherwise.
// NOTE(review): with 3 elements per lane, 76 floordiv 3 would give lane 25;
// the expected lane 26 matches a ceildiv -- confirm this lane computation is
// intended.
1208// CHECK-PROP-LABEL: func @vector_insert_1d(
1209//  CHECK-PROP-SAME:     %[[LANEID:.*]]: index
1210//   CHECK-PROP-DAG:   %[[C26:.*]] = arith.constant 26 : index
1211//       CHECK-PROP:   %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, f32)
1212//       CHECK-PROP:     %[[VEC:.*]] = "some_def"
1213//       CHECK-PROP:     %[[VAL:.*]] = "another_def"
1214//       CHECK-PROP:     gpu.yield %[[VEC]], %[[VAL]]
1215//       CHECK-PROP:   %[[SHOULD_INSERT:.*]] = arith.cmpi eq, %[[LANEID]], %[[C26]]
1216//       CHECK-PROP:   %[[R:.*]] = scf.if %[[SHOULD_INSERT]] -> (vector<3xf32>) {
1217//       CHECK-PROP:     %[[INSERT:.*]] = vector.insert %[[W]]#1, %[[W]]#0 [1]
1218//       CHECK-PROP:     scf.yield %[[INSERT]]
1219//       CHECK-PROP:   } else {
1220//       CHECK-PROP:     scf.yield %[[W]]#0
1221//       CHECK-PROP:   }
1222//       CHECK-PROP:   return %[[R]]
1223func.func @vector_insert_1d(%laneid: index) -> (vector<3xf32>) {
1224  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) {
1225    %0 = "some_def"() : () -> (vector<96xf32>)
1226    %f = "another_def"() : () -> (f32)
1227    %1 = vector.insert %f, %0[76] : f32 into vector<96xf32>
1228    gpu.yield %1 : vector<96xf32>
1229  }
1230  return %r : vector<3xf32>
1231}
1232
1233// -----
1234
1235// CHECK-PROP-LABEL: func @vector_insert_2d_distr_src(
1236//  CHECK-PROP-SAME:     %[[LANEID:.*]]: index
1237//       CHECK-PROP:   %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, vector<4x3xf32>)
1238//       CHECK-PROP:     %[[VEC:.*]] = "some_def"
1239//       CHECK-PROP:     %[[VAL:.*]] = "another_def"
1240//       CHECK-PROP:     gpu.yield %[[VAL]], %[[VEC]]
1241//       CHECK-PROP:   %[[INSERT:.*]] = vector.insert %[[W]]#0, %[[W]]#1 [2] : vector<3xf32> into vector<4x3xf32>
1242//       CHECK-PROP:   return %[[INSERT]]
// Inserts a vector<96xf32> row at position 2 of a vector<4x96xf32>. The warp
// op returns vector<4x3xf32>, i.e. the trailing dimension is the one that
// shrinks per lane.
func.func @vector_insert_2d_distr_src(%laneid: index) -> (vector<4x3xf32>) {
  %result = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x3xf32>) {
    %base = "some_def"() : () -> (vector<4x96xf32>)
    %row = "another_def"() : () -> (vector<96xf32>)
    %updated = vector.insert %row, %base[2] : vector<96xf32> into vector<4x96xf32>
    gpu.yield %updated : vector<4x96xf32>
  }
  return %result : vector<4x3xf32>
}
1252
1253// -----
1254
1255// CHECK-PROP-LABEL: func @vector_insert_2d_distr_pos(
1256//  CHECK-PROP-SAME:     %[[LANEID:.*]]: index
1257//       CHECK-PROP:   %[[C19:.*]] = arith.constant 19 : index
1258//       CHECK-PROP:   %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, vector<4x96xf32>)
1259//       CHECK-PROP:     %[[VEC:.*]] = "some_def"
1260//       CHECK-PROP:     %[[VAL:.*]] = "another_def"
1261//       CHECK-PROP:     gpu.yield %[[VAL]], %[[VEC]]
1262//       CHECK-PROP:   %[[SHOULD_INSERT:.*]] = arith.cmpi eq, %[[LANEID]], %[[C19]]
1263//       CHECK-PROP:   %[[R:.*]] = scf.if %[[SHOULD_INSERT]] -> (vector<4x96xf32>) {
1264//       CHECK-PROP:     %[[INSERT:.*]] = vector.insert %[[W]]#0, %[[W]]#1 [3] : vector<96xf32> into vector<4x96xf32>
1265//       CHECK-PROP:     scf.yield %[[INSERT]]
1266//       CHECK-PROP:   } else {
1267//       CHECK-PROP:     scf.yield %[[W]]#1
1268//       CHECK-PROP:   }
1269//       CHECK-PROP:   return %[[R]]
// Inserts a vector<96xf32> row at position 79 of a vector<128x96xf32>. The
// warp op returns vector<4x96xf32>, i.e. the leading dimension (the one being
// indexed by the insert position) is the one that shrinks per lane.
func.func @vector_insert_2d_distr_pos(%laneid: index) -> (vector<4x96xf32>) {
  %result = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) {
    %base = "some_def"() : () -> (vector<128x96xf32>)
    %row = "another_def"() : () -> (vector<96xf32>)
    %updated = vector.insert %row, %base[79] : vector<96xf32> into vector<128x96xf32>
    gpu.yield %updated : vector<128x96xf32>
  }
  return %result : vector<4x96xf32>
}
1279
1280// -----
1281
1282// CHECK-PROP-LABEL: func @vector_insert_2d_broadcast(
1283//  CHECK-PROP-SAME:     %[[LANEID:.*]]: index
1284//       CHECK-PROP:   %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, vector<4x96xf32>)
1285//       CHECK-PROP:     %[[VEC:.*]] = "some_def"
1286//       CHECK-PROP:     %[[VAL:.*]] = "another_def"
1287//       CHECK-PROP:     gpu.yield %[[VAL]], %[[VEC]]
1288//       CHECK-PROP:   %[[INSERT:.*]] = vector.insert %[[W]]#0, %[[W]]#1 [2] : vector<96xf32> into vector<4x96xf32>
1289//       CHECK-PROP:   return %[[INSERT]]
// Inserts a vector<96xf32> row at position 2 of a vector<4x96xf32>. The
// yielded type and the warp op result type are both vector<4x96xf32>, so no
// dimension shrinks per lane.
func.func @vector_insert_2d_broadcast(%laneid: index) -> (vector<4x96xf32>) {
  %result = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) {
    %base = "some_def"() : () -> (vector<4x96xf32>)
    %row = "another_def"() : () -> (vector<96xf32>)
    %updated = vector.insert %row, %base[2] : vector<96xf32> into vector<4x96xf32>
    gpu.yield %updated : vector<4x96xf32>
  }
  return %result : vector<4x96xf32>
}
1299
1300// -----
1301
1302// Make sure that all operands of the transfer_read op are properly propagated.
1303// The vector.extractelement op cannot be propagated because index-typed
1304// shuffles are not supported at the moment.
1305
1306// CHECK-PROP: #[[$MAP:.*]] = affine_map<()[s0] -> (s0 * 2)>
1307// CHECK-PROP-LABEL: func @transfer_read_prop_operands(
1308//  CHECK-PROP-SAME:     %[[IN2:[^ :]*]]: vector<1x2xindex>,
1309//  CHECK-PROP-SAME:     %[[AR1:[^ :]*]]: memref<1x4x2xi32>,
1310//  CHECK-PROP-SAME:     %[[AR2:[^ :]*]]: memref<1x4x1024xf32>)
1311//   CHECK-PROP-DAG:   %[[C0:.*]] = arith.constant 0 : index
1312//   CHECK-PROP-DAG:   %[[THREADID:.*]] = gpu.thread_id  x
1313//       CHECK-PROP:   %[[W:.*]] = gpu.warp_execute_on_lane_0(%[[THREADID]])[32] args(%[[IN2]]
1314//       CHECK-PROP:     %[[GATHER:.*]] = vector.gather %[[AR1]][{{.*}}]
1315//       CHECK-PROP:     %[[EXTRACT:.*]] = vector.extract %[[GATHER]][0] : vector<64xi32> from vector<1x64xi32>
1316//       CHECK-PROP:     %[[CAST:.*]] = arith.index_cast %[[EXTRACT]] : vector<64xi32> to vector<64xindex>
1317//       CHECK-PROP:     %[[EXTRACTELT:.*]] = vector.extract %[[CAST]][{{.*}}] : index from vector<64xindex>
1318//       CHECK-PROP:     gpu.yield %[[EXTRACTELT]] : index
1319//       CHECK-PROP:   %[[APPLY:.*]] = affine.apply #[[$MAP]]()[%[[THREADID]]]
1320//       CHECK-PROP:   %[[TRANSFERREAD:.*]] = vector.transfer_read %[[AR2]][%[[C0]], %[[W]], %[[APPLY]]],
1321//       CHECK-PROP:   return %[[TRANSFERREAD]]
// Input for the operand-propagation test: inside the warp region a gather
// produces a vector<1x64xi32>, which is narrowed to vector<64xi32>, cast to
// index, and element 0 of it is used as the middle index of the yielded
// transfer_read.
func.func @transfer_read_prop_operands(%in2: vector<1x2xindex>, %ar1: memref<1x4x2xi32>, %ar2: memref<1x4x1024xf32>) -> vector<2xf32> {
  %0 = gpu.thread_id  x
  // Position operand of the extractelement below; it is index-typed.
  %c0_pos = arith.constant 0 : index
  %c0 = arith.constant 0 : index
  // Pass-through and mask operands of the gather.
  %cst = arith.constant dense<0> : vector<1x64xi32>
  %cst_0 = arith.constant dense<true> : vector<1x64xi1>
  // Padding value of the transfer_read.
  %cst_6 = arith.constant 0.000000e+00 : f32

  %18 = gpu.warp_execute_on_lane_0(%0)[32] args(%in2 : vector<1x2xindex>) -> (vector<2xf32>) {
  ^bb0(%arg4: vector<1x64xindex>):
    %28 = vector.gather %ar1[%c0, %c0, %c0] [%arg4], %cst_0, %cst : memref<1x4x2xi32>, vector<1x64xindex>, vector<1x64xi1>, vector<1x64xi32> into vector<1x64xi32>
    %29 = vector.extract %28[0] : vector<64xi32> from vector<1x64xi32>
    %30 = arith.index_cast %29 : vector<64xi32> to vector<64xindex>
    %36 = vector.extractelement %30[%c0_pos : index] : vector<64xindex>
    %37 = vector.transfer_read %ar2[%c0, %36, %c0], %cst_6 {in_bounds = [true]} : memref<1x4x1024xf32>, vector<64xf32>
    gpu.yield %37 : vector<64xf32>
  }
  return %18 : vector<2xf32>
}
1343
1344// -----
1345
1346// Check that we don't fold vector.broadcast when each thread doesn't get the
1347// same value.
1348
1349// CHECK-PROP-LABEL: func @dont_fold_vector_broadcast(
1350//       CHECK-PROP:   %[[r:.*]] = gpu.warp_execute_on_lane_0{{.*}} -> (vector<1x2xf32>)
1351//       CHECK-PROP:     %[[some_def:.*]] = "some_def"
1352//       CHECK-PROP:     %[[broadcast:.*]] = vector.broadcast %[[some_def]] : vector<64xf32> to vector<1x64xf32>
1353//       CHECK-PROP:     gpu.yield %[[broadcast]] : vector<1x64xf32>
1354//       CHECK-PROP:   vector.print %[[r]] : vector<1x2xf32>
// Broadcasts a vector<64xf32> to vector<1x64xf32> inside the warp region; the
// warp op returns it as vector<1x2xf32>, which is then printed.
func.func @dont_fold_vector_broadcast(%laneid: index) {
  %distributed = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x2xf32>) {
    %source = "some_def"() : () -> (vector<64xf32>)
    %expanded = vector.broadcast %source : vector<64xf32> to vector<1x64xf32>
    gpu.yield %expanded : vector<1x64xf32>
  }
  vector.print %distributed : vector<1x2xf32>
  return
}
1364
1365// -----
1366
// Reads a full 32x4x32 tile, flattens it to vector<4096xf32> with a
// shape_cast, and yields it from a warp region of size 1024 whose result type
// is vector<4xf32>.
func.func @warp_propagate_shape_cast(%laneid: index, %src: memref<32x4x32xf32>) -> vector<4xf32> {
  %zero = arith.constant 0 : index
  %pad = arith.constant 0.000000e+00 : f32
  %flat_slice = gpu.warp_execute_on_lane_0(%laneid)[1024] -> (vector<4xf32>) {
    %tile = vector.transfer_read %src[%zero, %zero, %zero], %pad : memref<32x4x32xf32>, vector<32x4x32xf32>
    %flat = vector.shape_cast %tile : vector<32x4x32xf32> to vector<4096xf32>
    gpu.yield %flat : vector<4096xf32>
  }
  return %flat_slice : vector<4xf32>
}
1377
1378// CHECK-PROP-LABEL: func.func @warp_propagate_shape_cast
1379// CHECK-PROP:   %[[READ:.+]] = vector.transfer_read {{.+}} : memref<32x4x32xf32>, vector<1x1x4xf32>
1380// CHECK-PROP:   %[[CAST:.+]] = vector.shape_cast %[[READ]] : vector<1x1x4xf32> to vector<4xf32>
1381// CHECK-PROP:   return %[[CAST]] : vector<4xf32>
1382
1383// -----
1384
// The yielded transfer_read has type vector<1xf32>, identical to the warp op
// result type, so nothing about it is split across lanes.
func.func @warp_propagate_uniform_transfer_read(%laneid: index, %src: memref<4096xf32>, %index: index) -> vector<1xf32> {
  %f0 = arith.constant 0.000000e+00 : f32
  %r = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<1xf32>) {
    %1 = vector.transfer_read %src[%index], %f0 {in_bounds = [true]} : memref<4096xf32>, vector<1xf32>
    gpu.yield %1 : vector<1xf32>
  }
  return %r : vector<1xf32>
}

// The padding operand is matched with a wildcard: its SSA name is chosen by
// the printer and is not part of what this test verifies.
// CHECK-PROP-LABEL: func.func @warp_propagate_uniform_transfer_read
//  CHECK-PROP-SAME: (%{{.+}}: index, %[[SRC:.+]]: memref<4096xf32>, %[[INDEX:.+]]: index)
//       CHECK-PROP:   %[[READ:.+]] = vector.transfer_read %[[SRC]][%[[INDEX]]], %{{.*}} {in_bounds = [true]} : memref<4096xf32>, vector<1xf32>
//       CHECK-PROP:   return %[[READ]] : vector<1xf32>
1398
1399// -----
1400
// Two vector<1xf32> reads are yielded from the warp region; the first one also
// has a user ("some_use") inside the region.
func.func @warp_propagate_multi_transfer_read(%laneid: index, %src: memref<4096xf32>, %index: index, %index1: index) -> (vector<1xf32>, vector<1xf32>) {
  %pad = arith.constant 0.000000e+00 : f32
  %reads:2 = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<1xf32>, vector<1xf32>) {
    %first = vector.transfer_read %src[%index], %pad {in_bounds = [true]} : memref<4096xf32>, vector<1xf32>
    "some_use"(%first) : (vector<1xf32>) -> ()
    %second = vector.transfer_read %src[%index1], %pad {in_bounds = [true]} : memref<4096xf32>, vector<1xf32>
    gpu.yield %first, %second : vector<1xf32>, vector<1xf32>
  }
  return %reads#0, %reads#1 : vector<1xf32>, vector<1xf32>
}
1411
1412// CHECK-PROP-LABEL: func.func @warp_propagate_multi_transfer_read
1413//       CHECK-PROP:   gpu.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>)
1414//       CHECK-PROP:     %[[INNER_READ:.+]] = vector.transfer_read
1415//       CHECK-PROP:     "some_use"(%[[INNER_READ]])
1416//       CHECK-PROP:     gpu.yield %[[INNER_READ]] : vector<1xf32>
1417//       CHECK-PROP:   vector.transfer_read
1418
1419// -----
1420
// Two vector<64xf32> reads feed an elementwise maximumf; only the maximum is
// yielded, as vector<1xf32> per lane of a 64-wide warp region.
func.func @warp_propagate_dead_user_multi_read(%laneid: index, %src: memref<4096xf32>, %index: index, %index1: index) -> (vector<1xf32>) {
  %pad = arith.constant 0.000000e+00 : f32
  %result = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<1xf32>) {
    %lhs = vector.transfer_read %src[%index], %pad {in_bounds = [true]} : memref<4096xf32>, vector<64xf32>
    %rhs = vector.transfer_read %src[%index1], %pad {in_bounds = [true]} : memref<4096xf32>, vector<64xf32>
    %largest = arith.maximumf %lhs, %rhs : vector<64xf32>
    gpu.yield %largest : vector<64xf32>
  }
  return %result : vector<1xf32>
}
1431
1432//   CHECK-PROP-LABEL: func.func @warp_propagate_dead_user_multi_read
1433// CHECK-PROP-COUNT-2:   vector.transfer_read {{.*}} vector<1xf32>
1434//         CHECK-PROP:   arith.maximumf {{.*}} : vector<1xf32>
1435
1436// -----
1437
// Two masked transfer_writes to the same buffer inside one warp region: one of
// 4096 elements and one of 32 elements, each with its own mask.
func.func @warp_propagate_masked_write(%laneid: index, %dest: memref<4096xf32>) {
  %zero = arith.constant 0 : index
  gpu.warp_execute_on_lane_0(%laneid)[32] -> () {
    %wide_mask = "mask_def_0"() : () -> (vector<4096xi1>)
    %narrow_mask = "mask_def_1"() : () -> (vector<32xi1>)
    %wide_data = "some_def_0"() : () -> (vector<4096xf32>)
    %narrow_data = "some_def_1"() : () -> (vector<32xf32>)
    vector.transfer_write %wide_data, %dest[%zero], %wide_mask : vector<4096xf32>, memref<4096xf32>
    vector.transfer_write %narrow_data, %dest[%zero], %narrow_mask : vector<32xf32>, memref<4096xf32>
    gpu.yield
  }
  return
}
1451
1452// CHECK-DIST-AND-PROP-LABEL: func.func @warp_propagate_masked_write(
1453//       CHECK-DIST-AND-PROP:   %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xi1>, vector<128xf32>, vector<128xi1>) {
1454//       CHECK-DIST-AND-PROP:     %[[M0:.*]] = "mask_def_0"
1455//       CHECK-DIST-AND-PROP:     %[[M1:.*]] = "mask_def_1"
1456//       CHECK-DIST-AND-PROP:     %[[V0:.*]] = "some_def_0"
1457//       CHECK-DIST-AND-PROP:     %[[V1:.*]] = "some_def_1"
1458//       CHECK-DIST-AND-PROP:     gpu.yield %[[V1]], %[[M1]], %[[V0]], %[[M0]]
1459//  CHECK-DIST-AND-PROP-SAME:       vector<32xf32>, vector<32xi1>, vector<4096xf32>, vector<4096xi1>
1460//       CHECK-DIST-AND-PROP:   }
1461//       CHECK-DIST-AND-PROP:   vector.transfer_write %[[W]]#2, {{.*}}, %[[W]]#3 {in_bounds = [true]} : vector<128xf32>, memref<4096xf32>
1462//       CHECK-DIST-AND-PROP:   vector.transfer_write %[[W]]#0, {{.*}}, %[[W]]#1 {in_bounds = [true]} : vector<1xf32>, memref<4096xf32>
1463
1464// -----
1465
// Two masked reads from a 2-D memref are yielded from a 64-wide warp region:
// a 1-D read of 128 elements and a 2-D read of 128x2 elements, each with a
// mask of matching shape.
func.func @warp_propagate_masked_transfer_read(%laneid: index, %src: memref<4096x4096xf32>, %index: index) -> (vector<2xf32>, vector<2x2xf32>) {
  %pad = arith.constant 0.000000e+00 : f32
  %zero = arith.constant 0 : index
  %reads:2 = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<2xf32>, vector<2x2xf32>) {
    %mask_1d = "mask_def_0"() : () -> (vector<128xi1>)
    %read_1d = vector.transfer_read %src[%zero, %index], %pad, %mask_1d {in_bounds = [true]} : memref<4096x4096xf32>, vector<128xf32>
    %mask_2d = "mask_def_1"() : () -> (vector<128x2xi1>)
    %read_2d = vector.transfer_read %src[%zero, %index], %pad, %mask_2d {in_bounds = [true, true]} : memref<4096x4096xf32>, vector<128x2xf32>
    gpu.yield %read_1d, %read_2d : vector<128xf32>, vector<128x2xf32>
  }
  return %reads#0, %reads#1 : vector<2xf32>, vector<2x2xf32>
}
1478
1479//   CHECK-PROP-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 * 2)>
1480//   CHECK-PROP-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1] -> (s0 + s1 * 2)>
1481// CHECK-PROP-LABEL: func.func @warp_propagate_masked_transfer_read
1482//  CHECK-PROP-SAME:   %[[ARG0:.+]]: index, {{.*}}, %[[ARG2:.+]]: index
1483//       CHECK-PROP:   %[[C0:.*]] = arith.constant 0 : index
1484//       CHECK-PROP:   %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[64] -> (vector<2xi1>, vector<2x2xi1>) {
1485//       CHECK-PROP:     %[[M0:.*]] = "mask_def_0"
1486//       CHECK-PROP:     %[[M1:.*]] = "mask_def_1"
1487//       CHECK-PROP:     gpu.yield %[[M0]], %[[M1]] : vector<128xi1>, vector<128x2xi1>
1488//       CHECK-PROP:   }
1489//       CHECK-PROP:   %[[DIST_READ_IDX0:.+]] = affine.apply #[[$MAP0]]()[%[[ARG0]]]
1490//       CHECK-PROP:   vector.transfer_read {{.*}}[%[[DIST_READ_IDX0]], %[[ARG2]]], {{.*}}, %[[R]]#1 {{.*}} vector<2x2xf32>
1491//       CHECK-PROP:   %[[DIST_READ_IDX1:.+]] = affine.apply #[[$MAP1]]()[%[[ARG2]], %[[ARG0]]]
1492//       CHECK-PROP:   vector.transfer_read {{.*}}[%[[C0]], %[[DIST_READ_IDX1]]], {{.*}}, %[[R]]#0 {{.*}} vector<2xf32>
1493
1494// -----
1495
// A masked 1-D read whose permutation map selects the first memref dimension
// (d0) instead of the default trailing one.
func.func @warp_propagate_nontrivial_map_masked_transfer_read(%laneid: index, %src: memref<4096x4096xf32>, %index: index) -> vector<2xf32> {
  %pad = arith.constant 0.000000e+00 : f32
  %zero = arith.constant 0 : index
  %result = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<2xf32>) {
    %column_mask = "mask_def_0"() : () -> (vector<128xi1>)
    %column = vector.transfer_read %src[%index, %zero], %pad, %column_mask {in_bounds = [true], permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<4096x4096xf32>, vector<128xf32>
    gpu.yield %column : vector<128xf32>
  }
  return %result : vector<2xf32>
}
1506
1507//   CHECK-PROP-DAG: #[[$MAP0:.+]] = affine_map<()[s0, s1] -> (s0 + s1 * 2)>
1508//   CHECK-PROP-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1) -> (d0)>
1509// CHECK-PROP-LABEL: func.func @warp_propagate_nontrivial_map_masked_transfer_read
1510//  CHECK-PROP-SAME:   %[[ARG0:.+]]: index, {{.*}}, %[[ARG2:.+]]: index
1511//       CHECK-PROP:   %[[C0:.*]] = arith.constant 0 : index
1512//       CHECK-PROP:   %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[64] -> (vector<2xi1>) {
1513//       CHECK-PROP:     %[[M0:.*]] = "mask_def_0"
1514//       CHECK-PROP:     gpu.yield %[[M0]] : vector<128xi1>
1515//       CHECK-PROP:   }
1516//       CHECK-PROP:   %[[DIST_READ_IDX0:.+]] = affine.apply #[[$MAP0]]()[%[[ARG2]], %[[ARG0]]]
1517//       CHECK-PROP:   vector.transfer_read {{.*}}[%[[DIST_READ_IDX0]], %[[C0]]], {{.*}}, %[[R]]
1518//  CHECK-PROP-SAME:   permutation_map = #[[$MAP1]]} {{.*}} vector<2xf32>
1519
1520// -----
1521
// Two masked reads at different column offsets use the very same create_mask
// result as their mask.
func.func @warp_propagate_masked_transfer_read_shared_mask(%laneid: index, %src: memref<4096x4096xf32>, %index: index, %index2: index, %mask_ub: index) -> (vector<2xf32>, vector<2xf32>) {
  %pad = arith.constant 0.000000e+00 : f32
  %zero = arith.constant 0 : index
  %reads:2 = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<2xf32>, vector<2xf32>) {
    %shared_mask = vector.create_mask %mask_ub: vector<128xi1>
    %first = vector.transfer_read %src[%zero, %index], %pad, %shared_mask {in_bounds = [true]} : memref<4096x4096xf32>, vector<128xf32>
    %second = vector.transfer_read %src[%zero, %index2], %pad, %shared_mask {in_bounds = [true]} : memref<4096x4096xf32>, vector<128xf32>
    gpu.yield %first, %second : vector<128xf32>, vector<128xf32>
  }
  return %reads#0, %reads#1 : vector<2xf32>, vector<2xf32>
}
1533
1534// CHECK-PROP-LABEL: func.func @warp_propagate_masked_transfer_read_shared_mask
1535//       CHECK-PROP:   vector.create_mask %{{.*}} : vector<2xi1>
1536//       CHECK-PROP:   vector.transfer_read %{{.*}} : memref<4096x4096xf32>, vector<2xf32>
1537//       CHECK-PROP:   vector.create_mask %{{.*}} : vector<2xi1>
1538//       CHECK-PROP:   vector.transfer_read %{{.*}} : memref<4096x4096xf32>, vector<2xf32>
1539
1540// -----
1541
// Reads the whole buffer and then overwrites it with a constant; the written
// value does not depend on the read. The region yields a broadcast of %f1 and
// the value that was read.
func.func @warp_propagate_unconnected_read_write(%laneid: index, %buffer: memref<128xf32>, %f1: f32) -> (vector<2xf32>, vector<4xf32>) {
  %pad = arith.constant 0.000000e+00 : f32
  %zero = arith.constant 0 : index
  %results:2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>, vector<4xf32>) {
    %twos = arith.constant dense<2.0> : vector<128xf32>
    %loaded = vector.transfer_read %buffer[%zero], %pad {in_bounds = [true]} : memref<128xf32>, vector<128xf32>
    vector.transfer_write %twos, %buffer[%zero] : vector<128xf32>, memref<128xf32>
    %splat = vector.broadcast %f1 : f32 to vector<64xf32>
    gpu.yield %splat, %loaded : vector<64xf32>, vector<128xf32>
  }
  return %results#0, %results#1 : vector<2xf32>, vector<4xf32>
}
1554
1555// Verify that the write comes after the read
1556// CHECK-DIST-AND-PROP-LABEL: func.func @warp_propagate_unconnected_read_write(
1557//       CHECK-DIST-AND-PROP:   %[[CST:.+]] = arith.constant dense<2.000000e+00> : vector<4xf32>
1558//       CHECK-DIST-AND-PROP:   vector.transfer_read {{.*}} : memref<128xf32>, vector<4xf32>
1559//       CHECK-DIST-AND-PROP:   vector.transfer_write %[[CST]], {{.*}} : vector<4xf32>, memref<128xf32>
1560
1561// -----
1562
// Yields a 1-D create_mask of 32 elements from a 32-wide warp region; the warp
// op returns it as vector<1xi1>.
func.func @warp_propagate_create_mask(%laneid: index, %m0: index) -> vector<1xi1> {
  %lane_mask = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xi1>) {
    %full_mask = vector.create_mask %m0 : vector<32xi1>
    gpu.yield %full_mask : vector<32xi1>
  }
  return %lane_mask : vector<1xi1>
}
1570
1571//   CHECK-PROP-DAG: #[[$SUB:.*]] = affine_map<()[s0, s1] -> (-s0 + s1)>
1572// CHECK-PROP-LABEL: func @warp_propagate_create_mask
1573//  CHECK-PROP-SAME: %[[LANEID:.+]]: index, %[[M0:.+]]: index
1574//       CHECK-PROP:   %[[MDIST:.+]] = affine.apply #[[$SUB]]()[%[[LANEID]], %[[M0]]]
1575//       CHECK-PROP:   vector.create_mask %[[MDIST]] : vector<1xi1>
1576
1577// -----
1578
// Yields a 3-D create_mask of shape 16x4x4 from a 32-wide warp region; the
// warp op returns it as vector<1x2x4xi1>, so the two leading dimensions shrink
// and the trailing one does not.
func.func @warp_propagate_multi_dim_create_mask(%laneid: index, %m0: index, %m1: index, %m2: index) -> vector<1x2x4xi1> {
  %lane_mask = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x2x4xi1>) {
    %full_mask = vector.create_mask %m0, %m1, %m2 : vector<16x4x4xi1>
    gpu.yield %full_mask : vector<16x4x4xi1>
  }
  return %lane_mask : vector<1x2x4xi1>
}
1586
1587//   CHECK-PROP-DAG: #[[$SUBM0:.*]] = affine_map<()[s0, s1] -> (s0 - s1 floordiv 2)>
1588//   CHECK-PROP-DAG: #[[$SUBM1:.*]] = affine_map<()[s0, s1] -> (s0 - s1 * 2 + (s1 floordiv 2) * 4)>
1589// CHECK-PROP-LABEL: func @warp_propagate_multi_dim_create_mask
1590//  CHECK-PROP-SAME: %[[LANEID:.+]]: index, %[[M0:.+]]: index, %[[M1:.+]]: index, %[[M2:.+]]: index
1591//       CHECK-PROP:   %[[DISTM0:.+]] = affine.apply #[[$SUBM0]]()[%[[M0]], %[[LANEID]]]
1592//       CHECK-PROP:   %[[DISTM1:.+]] = affine.apply #[[$SUBM1]]()[%[[M1]], %[[LANEID]]]
1593//       CHECK-PROP:   vector.create_mask %[[DISTM0]], %[[DISTM1]], %[[M2]] : vector<1x2x4xi1>
1594
1595// -----
1596
// Writes a full vector<4x1024xf32> to a 2-D memref from inside a 32-wide warp
// region that has no results.
func.func @warp_propagate_nd_write(%laneid: index, %dest: memref<4x1024xf32>) {
  %c0 = arith.constant 0 : index
  gpu.warp_execute_on_lane_0(%laneid)[32] -> () {
    %0 = "some_def"() : () -> (vector<4x1024xf32>)
    vector.transfer_write %0, %dest[%c0, %c0] : vector<4x1024xf32>, memref<4x1024xf32>
    gpu.yield
  }
  return
}

//       CHECK-DIST-AND-PROP: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 128)>

// CHECK-DIST-AND-PROP-LABEL: func.func @warp_propagate_nd_write(
//       CHECK-DIST-AND-PROP:   %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1x128xf32>) {
//       CHECK-DIST-AND-PROP:     %[[V0:.*]] = "some_def"
//       CHECK-DIST-AND-PROP:     gpu.yield %[[V0]]
//  CHECK-DIST-AND-PROP-SAME:       vector<4x1024xf32>
//       CHECK-DIST-AND-PROP:   }

// The affine map is referenced through the captured alias rather than a
// literal name, so the match does not depend on how the printer numbers maps.
//       CHECK-DIST-AND-PROP:   %[[IDS:.+]]:2 = affine.delinearize_index %{{.*}} into (4, 8) : index, index
//       CHECK-DIST-AND-PROP:   %[[INNER_ID:.+]] = affine.apply #[[$MAP]]()[%[[IDS]]#1]
//       CHECK-DIST-AND-PROP:   vector.transfer_write %[[W]], %{{.*}}[%[[IDS]]#0, %[[INNER_ID]]] {{.*}} : vector<1x128xf32>
1619