xref: /llvm-project/llvm/test/CodeGen/AMDGPU/vselect.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2;RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck --check-prefixes=SI %s
3;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefixes=VI %s
4;RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck --check-prefixes=EG %s
5
; Vector select on <2 x i32>. Each lane of the vector loaded from %in0 is
; compared (signed greater-than) with the matching lane loaded from %in1;
; lanes where the compare is true take the kernel argument %val, the other
; lanes keep the value loaded from %in0. The result is stored to %out.
; The amdgcn check lines below expect one s_cmp_gt_i32 / s_cselect_b32 pair
; per lane; the r600 check lines expect one SETGT_INT / CNDE_INT pair per lane.
6define amdgpu_kernel void @test_select_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <2 x i32> %val) {
7; SI-LABEL: test_select_v2i32:
8; SI:       ; %bb.0: ; %entry
9; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
10; SI-NEXT:    s_waitcnt lgkmcnt(0)
11; SI-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x0
12; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
13; SI-NEXT:    s_mov_b32 s3, 0xf000
14; SI-NEXT:    s_waitcnt lgkmcnt(0)
15; SI-NEXT:    s_cmp_gt_i32 s9, s5
16; SI-NEXT:    s_cselect_b32 s5, s7, s9
17; SI-NEXT:    s_cmp_gt_i32 s8, s4
18; SI-NEXT:    s_cselect_b32 s4, s6, s8
19; SI-NEXT:    s_mov_b32 s2, -1
20; SI-NEXT:    v_mov_b32_e32 v1, s5
21; SI-NEXT:    v_mov_b32_e32 v0, s4
22; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
23; SI-NEXT:    s_endpgm
24;
25; VI-LABEL: test_select_v2i32:
26; VI:       ; %bb.0: ; %entry
27; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
28; VI-NEXT:    s_waitcnt lgkmcnt(0)
29; VI-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x0
30; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
31; VI-NEXT:    s_mov_b32 s3, 0xf000
32; VI-NEXT:    s_mov_b32 s2, -1
33; VI-NEXT:    s_waitcnt lgkmcnt(0)
34; VI-NEXT:    s_cmp_gt_i32 s9, s5
35; VI-NEXT:    s_cselect_b32 s5, s7, s9
36; VI-NEXT:    s_cmp_gt_i32 s8, s4
37; VI-NEXT:    s_cselect_b32 s4, s6, s8
38; VI-NEXT:    v_mov_b32_e32 v0, s4
39; VI-NEXT:    v_mov_b32_e32 v1, s5
40; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
41; VI-NEXT:    s_endpgm
42;
43; EG-LABEL: test_select_v2i32:
44; EG:       ; %bb.0: ; %entry
45; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
46; EG-NEXT:    TEX 1 @6
47; EG-NEXT:    ALU 5, @12, KC0[CB0:0-32], KC1[]
48; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
49; EG-NEXT:    CF_END
50; EG-NEXT:    PAD
51; EG-NEXT:    Fetch clause starting at 6:
52; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
53; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
54; EG-NEXT:    ALU clause starting at 10:
55; EG-NEXT:     MOV T0.X, KC0[2].Z,
56; EG-NEXT:     MOV * T1.X, KC0[2].W,
57; EG-NEXT:    ALU clause starting at 12:
58; EG-NEXT:     SETGT_INT * T0.W, T0.Y, T1.Y,
59; EG-NEXT:     CNDE_INT T0.Y, PV.W, T0.Y, KC0[3].Z,
60; EG-NEXT:     SETGT_INT * T0.W, T0.X, T1.X,
61; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.X, KC0[3].Y,
62; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
63; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
64entry:
65  %load0 = load <2 x i32>, ptr addrspace(1) %in0
66  %load1 = load <2 x i32>, ptr addrspace(1) %in1
67  %cmp = icmp sgt <2 x i32> %load0, %load1 ; per-lane signed greater-than mask
68  %result = select <2 x i1> %cmp, <2 x i32> %val, <2 x i32> %load0 ; true lanes take %val, false lanes keep %load0
69  store <2 x i32> %result, ptr addrspace(1) %out
70  ret void
71}
72
; Vector select on <2 x float>. Each lane of the vector loaded from %in0 is
; compared with the matching lane loaded from %in1 using fcmp une (unordered
; or not equal); lanes where the compare is true take the %in0 value, the
; other lanes take the %in1 value. The result is stored to %out.
; The amdgcn check lines below expect one v_cmp_neq_f32 / v_cndmask_b32 pair
; per lane; the r600 check lines expect one SETNE_DX10 / CNDE_INT pair per lane.
73define amdgpu_kernel void @test_select_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
74; SI-LABEL: test_select_v2f32:
75; SI:       ; %bb.0: ; %entry
76; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
77; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
78; SI-NEXT:    s_waitcnt lgkmcnt(0)
79; SI-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x0
80; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
81; SI-NEXT:    s_mov_b32 s3, 0xf000
82; SI-NEXT:    s_mov_b32 s2, -1
83; SI-NEXT:    s_waitcnt lgkmcnt(0)
84; SI-NEXT:    v_mov_b32_e32 v0, s4
85; SI-NEXT:    v_mov_b32_e32 v1, s5
86; SI-NEXT:    v_mov_b32_e32 v2, s7
87; SI-NEXT:    v_cmp_neq_f32_e32 vcc, s7, v1
88; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
89; SI-NEXT:    v_mov_b32_e32 v2, s6
90; SI-NEXT:    v_cmp_neq_f32_e32 vcc, s6, v0
91; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
92; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
93; SI-NEXT:    s_endpgm
94;
95; VI-LABEL: test_select_v2f32:
96; VI:       ; %bb.0: ; %entry
97; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
98; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
99; VI-NEXT:    s_mov_b32 s7, 0xf000
100; VI-NEXT:    s_mov_b32 s6, -1
101; VI-NEXT:    s_waitcnt lgkmcnt(0)
102; VI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
103; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
104; VI-NEXT:    s_mov_b32 s4, s0
105; VI-NEXT:    s_mov_b32 s5, s1
106; VI-NEXT:    s_waitcnt lgkmcnt(0)
107; VI-NEXT:    v_mov_b32_e32 v1, s9
108; VI-NEXT:    v_mov_b32_e32 v0, s8
109; VI-NEXT:    v_mov_b32_e32 v2, s3
110; VI-NEXT:    v_cmp_neq_f32_e32 vcc, s3, v1
111; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
112; VI-NEXT:    v_mov_b32_e32 v2, s2
113; VI-NEXT:    v_cmp_neq_f32_e32 vcc, s2, v0
114; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
115; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
116; VI-NEXT:    s_endpgm
117;
118; EG-LABEL: test_select_v2f32:
119; EG:       ; %bb.0: ; %entry
120; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
121; EG-NEXT:    TEX 1 @6
122; EG-NEXT:    ALU 5, @12, KC0[CB0:0-32], KC1[]
123; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
124; EG-NEXT:    CF_END
125; EG-NEXT:    PAD
126; EG-NEXT:    Fetch clause starting at 6:
127; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
128; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
129; EG-NEXT:    ALU clause starting at 10:
130; EG-NEXT:     MOV T0.X, KC0[2].Z,
131; EG-NEXT:     MOV * T1.X, KC0[2].W,
132; EG-NEXT:    ALU clause starting at 12:
133; EG-NEXT:     SETNE_DX10 * T0.W, T0.Y, T1.Y,
134; EG-NEXT:     CNDE_INT T0.Y, PV.W, T1.Y, T0.Y,
135; EG-NEXT:     SETNE_DX10 * T0.W, T0.X, T1.X,
136; EG-NEXT:     CNDE_INT T0.X, PV.W, T1.X, T0.X,
137; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
138; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
139entry:
140  %0 = load <2 x float>, ptr addrspace(1) %in0
141  %1 = load <2 x float>, ptr addrspace(1) %in1
142  %cmp = fcmp une <2 x float> %0, %1 ; per-lane unordered-or-not-equal mask
143  %result = select <2 x i1> %cmp, <2 x float> %0, <2 x float> %1 ; true lanes take the %in0 value, false lanes the %in1 value
144  store <2 x float> %result, ptr addrspace(1) %out
145  ret void
146}
147
; Vector select on <4 x i32>. Each lane of the vector loaded from %in0 is
; compared (signed greater-than) with the matching lane loaded from %in1;
; lanes where the compare is true take the kernel argument %val, the other
; lanes keep the value loaded from %in0. The result is stored to %out.
; The amdgcn check lines below expect four s_cmp_gt_i32 / s_cselect_b32
; pairs; the r600 check lines expect four SETGT_INT and four CNDE_INT
; instructions.
148define amdgpu_kernel void @test_select_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <4 x i32> %val) {
149; SI-LABEL: test_select_v4i32:
150; SI:       ; %bb.0: ; %entry
151; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
152; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
153; SI-NEXT:    s_waitcnt lgkmcnt(0)
154; SI-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
155; SI-NEXT:    s_load_dwordx4 s[12:15], s[6:7], 0x0
156; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x11
157; SI-NEXT:    s_mov_b32 s3, 0xf000
158; SI-NEXT:    s_waitcnt lgkmcnt(0)
159; SI-NEXT:    s_cmp_gt_i32 s10, s14
160; SI-NEXT:    s_cselect_b32 s6, s6, s10
161; SI-NEXT:    s_cmp_gt_i32 s9, s13
162; SI-NEXT:    s_cselect_b32 s5, s5, s9
163; SI-NEXT:    s_cmp_gt_i32 s11, s15
164; SI-NEXT:    s_cselect_b32 s7, s7, s11
165; SI-NEXT:    s_cmp_gt_i32 s8, s12
166; SI-NEXT:    s_cselect_b32 s4, s4, s8
167; SI-NEXT:    s_mov_b32 s2, -1
168; SI-NEXT:    v_mov_b32_e32 v2, s6
169; SI-NEXT:    v_mov_b32_e32 v1, s5
170; SI-NEXT:    v_mov_b32_e32 v3, s7
171; SI-NEXT:    v_mov_b32_e32 v0, s4
172; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
173; SI-NEXT:    s_endpgm
174;
175; VI-LABEL: test_select_v4i32:
176; VI:       ; %bb.0: ; %entry
177; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
178; VI-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
179; VI-NEXT:    s_mov_b32 s7, 0xf000
180; VI-NEXT:    s_mov_b32 s6, -1
181; VI-NEXT:    s_waitcnt lgkmcnt(0)
182; VI-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
183; VI-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
184; VI-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x44
185; VI-NEXT:    s_mov_b32 s4, s0
186; VI-NEXT:    s_mov_b32 s5, s1
187; VI-NEXT:    s_waitcnt lgkmcnt(0)
188; VI-NEXT:    s_cmp_gt_i32 s10, s14
189; VI-NEXT:    s_cselect_b32 s0, s18, s10
190; VI-NEXT:    s_cmp_gt_i32 s9, s13
191; VI-NEXT:    s_cselect_b32 s1, s17, s9
192; VI-NEXT:    s_cmp_gt_i32 s11, s15
193; VI-NEXT:    s_cselect_b32 s2, s19, s11
194; VI-NEXT:    s_cmp_gt_i32 s8, s12
195; VI-NEXT:    s_cselect_b32 s3, s16, s8
196; VI-NEXT:    v_mov_b32_e32 v0, s3
197; VI-NEXT:    v_mov_b32_e32 v1, s1
198; VI-NEXT:    v_mov_b32_e32 v2, s0
199; VI-NEXT:    v_mov_b32_e32 v3, s2
200; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
201; VI-NEXT:    s_endpgm
202;
203; EG-LABEL: test_select_v4i32:
204; EG:       ; %bb.0: ; %entry
205; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
206; EG-NEXT:    TEX 1 @6
207; EG-NEXT:    ALU 9, @12, KC0[CB0:0-32], KC1[]
208; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
209; EG-NEXT:    CF_END
210; EG-NEXT:    PAD
211; EG-NEXT:    Fetch clause starting at 6:
212; EG-NEXT:     VTX_READ_128 T1.XYZW, T1.X, 0, #1
213; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
214; EG-NEXT:    ALU clause starting at 10:
215; EG-NEXT:     MOV T0.X, KC0[2].Z,
216; EG-NEXT:     MOV * T1.X, KC0[2].W,
217; EG-NEXT:    ALU clause starting at 12:
218; EG-NEXT:     SETGT_INT T1.W, T0.W, T1.W,
219; EG-NEXT:     SETGT_INT * T2.W, T0.Z, T1.Z,
220; EG-NEXT:     CNDE_INT * T0.W, PV.W, T0.W, KC0[4].X,
221; EG-NEXT:     CNDE_INT T0.Z, T2.W, T0.Z, KC0[3].W,
222; EG-NEXT:     SETGT_INT * T1.W, T0.Y, T1.Y,
223; EG-NEXT:     CNDE_INT T0.Y, PV.W, T0.Y, KC0[3].Z,
224; EG-NEXT:     SETGT_INT * T1.W, T0.X, T1.X,
225; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.X, KC0[3].Y,
226; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
227; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
228entry:
229  %load0 = load <4 x i32>, ptr addrspace(1) %in0
230  %load1 = load <4 x i32>, ptr addrspace(1) %in1
231  %cmp = icmp sgt <4 x i32> %load0, %load1 ; per-lane signed greater-than mask
232  %result = select <4 x i1> %cmp, <4 x i32> %val, <4 x i32> %load0 ; true lanes take %val, false lanes keep %load0
233  store <4 x i32> %result, ptr addrspace(1) %out
234  ret void
235}
236
; Vector select on <4 x float>. Each lane of the vector loaded from %in0 is
; compared with the matching lane loaded from %in1 using fcmp une (unordered
; or not equal); lanes where the compare is true take the %in0 value, the
; other lanes take the %in1 value. The result is stored to %out.
; The amdgcn check lines below expect four v_cmp_neq_f32 / v_cndmask_b32
; pairs; the r600 check lines expect four SETNE_DX10 and four CNDE_INT
; instructions.
237define amdgpu_kernel void @test_select_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
238; SI-LABEL: test_select_v4f32:
239; SI:       ; %bb.0: ; %entry
240; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
241; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
242; SI-NEXT:    s_waitcnt lgkmcnt(0)
243; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
244; SI-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x0
245; SI-NEXT:    s_mov_b32 s3, 0xf000
246; SI-NEXT:    s_mov_b32 s2, -1
247; SI-NEXT:    s_waitcnt lgkmcnt(0)
248; SI-NEXT:    v_mov_b32_e32 v0, s8
249; SI-NEXT:    v_mov_b32_e32 v1, s9
250; SI-NEXT:    v_mov_b32_e32 v2, s10
251; SI-NEXT:    v_mov_b32_e32 v3, s11
252; SI-NEXT:    v_mov_b32_e32 v4, s7
253; SI-NEXT:    v_cmp_neq_f32_e32 vcc, s7, v3
254; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
255; SI-NEXT:    v_mov_b32_e32 v4, s6
256; SI-NEXT:    v_cmp_neq_f32_e32 vcc, s6, v2
257; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
258; SI-NEXT:    v_mov_b32_e32 v4, s5
259; SI-NEXT:    v_cmp_neq_f32_e32 vcc, s5, v1
260; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
261; SI-NEXT:    v_mov_b32_e32 v4, s4
262; SI-NEXT:    v_cmp_neq_f32_e32 vcc, s4, v0
263; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
264; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
265; SI-NEXT:    s_endpgm
266;
267; VI-LABEL: test_select_v4f32:
268; VI:       ; %bb.0: ; %entry
269; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
270; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
271; VI-NEXT:    s_mov_b32 s7, 0xf000
272; VI-NEXT:    s_mov_b32 s6, -1
273; VI-NEXT:    s_waitcnt lgkmcnt(0)
274; VI-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x0
275; VI-NEXT:    s_mov_b32 s4, s0
276; VI-NEXT:    s_mov_b32 s5, s1
277; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
278; VI-NEXT:    s_waitcnt lgkmcnt(0)
279; VI-NEXT:    v_mov_b32_e32 v3, s11
280; VI-NEXT:    v_mov_b32_e32 v2, s10
281; VI-NEXT:    v_mov_b32_e32 v1, s9
282; VI-NEXT:    v_mov_b32_e32 v4, s3
283; VI-NEXT:    v_cmp_neq_f32_e32 vcc, s3, v3
284; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
285; VI-NEXT:    v_mov_b32_e32 v4, s2
286; VI-NEXT:    v_cmp_neq_f32_e32 vcc, s2, v2
287; VI-NEXT:    v_mov_b32_e32 v0, s8
288; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
289; VI-NEXT:    v_mov_b32_e32 v4, s1
290; VI-NEXT:    v_cmp_neq_f32_e32 vcc, s1, v1
291; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
292; VI-NEXT:    v_mov_b32_e32 v4, s0
293; VI-NEXT:    v_cmp_neq_f32_e32 vcc, s0, v0
294; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
295; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
296; VI-NEXT:    s_endpgm
297;
298; EG-LABEL: test_select_v4f32:
299; EG:       ; %bb.0: ; %entry
300; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
301; EG-NEXT:    TEX 1 @6
302; EG-NEXT:    ALU 9, @12, KC0[CB0:0-32], KC1[]
303; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
304; EG-NEXT:    CF_END
305; EG-NEXT:    PAD
306; EG-NEXT:    Fetch clause starting at 6:
307; EG-NEXT:     VTX_READ_128 T1.XYZW, T1.X, 0, #1
308; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
309; EG-NEXT:    ALU clause starting at 10:
310; EG-NEXT:     MOV T0.X, KC0[2].Z,
311; EG-NEXT:     MOV * T1.X, KC0[2].W,
312; EG-NEXT:    ALU clause starting at 12:
313; EG-NEXT:     SETNE_DX10 T2.W, T0.W, T1.W,
314; EG-NEXT:     SETNE_DX10 * T3.W, T0.Z, T1.Z,
315; EG-NEXT:     CNDE_INT * T0.W, PV.W, T1.W, T0.W,
316; EG-NEXT:     CNDE_INT T0.Z, T3.W, T1.Z, T0.Z,
317; EG-NEXT:     SETNE_DX10 * T1.W, T0.Y, T1.Y,
318; EG-NEXT:     CNDE_INT T0.Y, PV.W, T1.Y, T0.Y,
319; EG-NEXT:     SETNE_DX10 * T1.W, T0.X, T1.X,
320; EG-NEXT:     CNDE_INT T0.X, PV.W, T1.X, T0.X,
321; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
322; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
323entry:
324  %0 = load <4 x float>, ptr addrspace(1) %in0
325  %1 = load <4 x float>, ptr addrspace(1) %in1
326  %cmp = fcmp une <4 x float> %0, %1 ; per-lane unordered-or-not-equal mask
327  %result = select <4 x i1> %cmp, <4 x float> %0, <4 x float> %1 ; true lanes take the %in0 value, false lanes the %in1 value
328  store <4 x float> %result, ptr addrspace(1) %out
329  ret void
330}
331