xref: /llvm-project/llvm/test/CodeGen/AMDGPU/select64.ll (revision 085f078307bac264301b07f6e47e2a04e90a6f1d)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck -check-prefix=SI %s
3; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=VI %s
4; RUN: llc < %s -march=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck -check-prefix=GFX90A %s
5
; select0: 64-bit select between the constant 0 and a kernel argument.
;
; Stores (%cond >u 5) ? 0 : %in to %out as a full i64.
;
; The assertions below expect the comparison in inverted form (%cond <u 6)
; with the select arms swapped accordingly. For tahiti the expected select is
; a pair of v_cndmask_b32 instructions, one per 32-bit half of the value; for
; tonga and gfx90a it is a single scalar s_cselect_b64 against 0.
6define amdgpu_kernel void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) {
7; SI-LABEL: select0:
8; SI:       ; %bb.0: ; %entry
9; SI-NEXT:    s_load_dword s6, s[0:1], 0xb
10; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
11; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
12; SI-NEXT:    s_mov_b32 s3, 0xf000
13; SI-NEXT:    s_mov_b32 s2, -1
14; SI-NEXT:    s_waitcnt lgkmcnt(0)
15; SI-NEXT:    s_cmp_lt_u32 s6, 6
16; SI-NEXT:    v_mov_b32_e32 v0, s5
17; SI-NEXT:    s_cselect_b64 vcc, -1, 0
18; SI-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
19; SI-NEXT:    v_mov_b32_e32 v0, s4
20; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
21; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
22; SI-NEXT:    s_endpgm
23;
24; VI-LABEL: select0:
25; VI:       ; %bb.0: ; %entry
26; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
27; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
28; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
29; VI-NEXT:    s_waitcnt lgkmcnt(0)
30; VI-NEXT:    s_cmp_lt_u32 s4, 6
31; VI-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
32; VI-NEXT:    v_mov_b32_e32 v0, s0
33; VI-NEXT:    v_mov_b32_e32 v2, s2
34; VI-NEXT:    v_mov_b32_e32 v1, s1
35; VI-NEXT:    v_mov_b32_e32 v3, s3
36; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
37; VI-NEXT:    s_endpgm
38;
39; GFX90A-LABEL: select0:
40; GFX90A:       ; %bb.0: ; %entry
41; GFX90A-NEXT:    s_load_dword s6, s[0:1], 0x2c
42; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
43; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
44; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
45; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
46; GFX90A-NEXT:    s_cmp_lt_u32 s6, 6
47; GFX90A-NEXT:    s_cselect_b64 s[0:1], s[2:3], 0
48; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
49; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
50; GFX90A-NEXT:    s_endpgm
; The block label "entry" is echoed in the "%bb.0" assertion lines above, so
; it must stay in sync with them.
51entry:
; %0 is true when the zero arm is to be selected.
52  %0 = icmp ugt i32 %cond, 5
53  %1 = select i1 %0, i64 0, i64 %in
; The full 64-bit select result is stored (no truncation in this test).
54  store i64 %1, i64 addrspace(1)* %out
55  ret void
56}
57
; select_trunc_i64: i64 select of 0 vs. a kernel argument whose result is
; truncated to i32 before being stored.
;
; Only the truncated value reaches the store, and the assertions below expect
; the whole operation to be done on 32 bits: %in is fetched with a single
; s_load_dword (not a dwordx2), and the select is one v_cndmask_b32 on tahiti
; or one s_cselect_b32 on tonga and gfx90a. The comparison is expected in the
; inverted form (%cond <u 6) with the select arms swapped.
58define amdgpu_kernel void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind {
59; SI-LABEL: select_trunc_i64:
60; SI:       ; %bb.0:
61; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
62; SI-NEXT:    s_load_dword s5, s[0:1], 0xd
63; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
64; SI-NEXT:    s_mov_b32 s3, 0xf000
65; SI-NEXT:    s_mov_b32 s2, -1
66; SI-NEXT:    s_waitcnt lgkmcnt(0)
67; SI-NEXT:    s_cmp_lt_u32 s4, 6
68; SI-NEXT:    v_mov_b32_e32 v0, s5
69; SI-NEXT:    s_cselect_b64 vcc, -1, 0
70; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
71; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
72; SI-NEXT:    s_endpgm
73;
74; VI-LABEL: select_trunc_i64:
75; VI:       ; %bb.0:
76; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
77; VI-NEXT:    s_load_dword s3, s[0:1], 0x34
78; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
79; VI-NEXT:    s_waitcnt lgkmcnt(0)
80; VI-NEXT:    s_cmp_lt_u32 s2, 6
81; VI-NEXT:    s_cselect_b32 s2, s3, 0
82; VI-NEXT:    v_mov_b32_e32 v0, s0
83; VI-NEXT:    v_mov_b32_e32 v1, s1
84; VI-NEXT:    v_mov_b32_e32 v2, s2
85; VI-NEXT:    flat_store_dword v[0:1], v2
86; VI-NEXT:    s_endpgm
87;
88; GFX90A-LABEL: select_trunc_i64:
89; GFX90A:       ; %bb.0:
90; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
91; GFX90A-NEXT:    s_load_dword s5, s[0:1], 0x34
92; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
93; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
94; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX90A-NEXT:    s_cmp_lt_u32 s4, 6
96; GFX90A-NEXT:    s_cselect_b32 s0, s5, 0
97; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
98; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
99; GFX90A-NEXT:    s_endpgm
; The select is written at i64 width in the IR; the trunc that follows is what
; lets the expected code above operate on a single dword.
100  %cmp = icmp ugt i32 %cond, 5
101  %sel = select i1 %cmp, i64 0, i64 %in
102  %trunc = trunc i64 %sel to i32
103  store i32 %trunc, i32 addrspace(1)* %out, align 4
104  ret void
105}
106
; select_trunc_i64_2: i64 select between two kernel arguments (%a when
; %cond >u 5, otherwise %b), truncated to i32 before being stored.
;
; The assertions below expect both arguments to be fetched together with one
; s_load_dwordx4 into s[4:7], and the select to use only s4 and s6 (the first
; register of each 64-bit pair) in a single 32-bit operation: v_cndmask_b32 on
; tahiti, s_cselect_b32 on tonga and gfx90a. Unlike the tests that select
; against the constant 0, the comparison here is expected in its original
; form (s_cmp_gt_u32 ..., 5).
107define amdgpu_kernel void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind {
108; SI-LABEL: select_trunc_i64_2:
109; SI:       ; %bb.0:
110; SI-NEXT:    s_load_dword s8, s[0:1], 0xb
111; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
112; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
113; SI-NEXT:    s_mov_b32 s3, 0xf000
114; SI-NEXT:    s_mov_b32 s2, -1
115; SI-NEXT:    s_waitcnt lgkmcnt(0)
116; SI-NEXT:    s_cmp_gt_u32 s8, 5
117; SI-NEXT:    v_mov_b32_e32 v0, s6
118; SI-NEXT:    v_mov_b32_e32 v1, s4
119; SI-NEXT:    s_cselect_b64 vcc, -1, 0
120; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
121; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
122; SI-NEXT:    s_endpgm
123;
124; VI-LABEL: select_trunc_i64_2:
125; VI:       ; %bb.0:
126; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
127; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
128; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
129; VI-NEXT:    s_waitcnt lgkmcnt(0)
130; VI-NEXT:    s_cmp_gt_u32 s2, 5
131; VI-NEXT:    s_cselect_b32 s2, s4, s6
132; VI-NEXT:    v_mov_b32_e32 v0, s0
133; VI-NEXT:    v_mov_b32_e32 v1, s1
134; VI-NEXT:    v_mov_b32_e32 v2, s2
135; VI-NEXT:    flat_store_dword v[0:1], v2
136; VI-NEXT:    s_endpgm
137;
138; GFX90A-LABEL: select_trunc_i64_2:
139; GFX90A:       ; %bb.0:
140; GFX90A-NEXT:    s_load_dword s8, s[0:1], 0x2c
141; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
142; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
143; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
144; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
145; GFX90A-NEXT:    s_cmp_gt_u32 s8, 5
146; GFX90A-NEXT:    s_cselect_b32 s0, s4, s6
147; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
148; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
149; GFX90A-NEXT:    s_endpgm
; Both select arms are full i64 values in the IR; only the truncated result is
; stored.
150  %cmp = icmp ugt i32 %cond, 5
151  %sel = select i1 %cmp, i64 %a, i64 %b
152  %trunc = trunc i64 %sel to i32
153  store i32 %trunc, i32 addrspace(1)* %out, align 4
154  ret void
155}
156
; v_select_trunc_i64_2: i64 select between two values loaded from global
; memory through %aptr and %bptr, truncated to i32 before being stored.
;
; The IR loads are 64 bits wide, but the assertions below expect each pointer
; to be read with a single-dword s_load_dword at offset 0x0, followed by one
; 32-bit select (v_cndmask_b32 on tahiti, s_cselect_b32 on tonga and gfx90a).
;
; NOTE(review): the "v_" prefix in the name presumably once indicated vector
; memory operands; the loads expected below are all scalar (s_load_*).
157define amdgpu_kernel void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
158; SI-LABEL: v_select_trunc_i64_2:
159; SI:       ; %bb.0:
160; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
161; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
162; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
163; SI-NEXT:    s_mov_b32 s11, 0xf000
164; SI-NEXT:    s_mov_b32 s10, -1
165; SI-NEXT:    s_waitcnt lgkmcnt(0)
166; SI-NEXT:    s_load_dword s1, s[6:7], 0x0
167; SI-NEXT:    s_load_dword s2, s[4:5], 0x0
168; SI-NEXT:    s_cmp_gt_u32 s0, 5
169; SI-NEXT:    s_cselect_b64 vcc, -1, 0
170; SI-NEXT:    s_waitcnt lgkmcnt(0)
171; SI-NEXT:    v_mov_b32_e32 v0, s1
172; SI-NEXT:    v_mov_b32_e32 v1, s2
173; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
174; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
175; SI-NEXT:    s_endpgm
176;
177; VI-LABEL: v_select_trunc_i64_2:
178; VI:       ; %bb.0:
179; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
180; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
181; VI-NEXT:    s_waitcnt lgkmcnt(0)
182; VI-NEXT:    s_load_dword s3, s[4:5], 0x0
183; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
184; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
185; VI-NEXT:    s_cmp_gt_u32 s2, 5
186; VI-NEXT:    s_waitcnt lgkmcnt(0)
187; VI-NEXT:    s_cselect_b32 s2, s3, s4
188; VI-NEXT:    v_mov_b32_e32 v0, s0
189; VI-NEXT:    v_mov_b32_e32 v1, s1
190; VI-NEXT:    v_mov_b32_e32 v2, s2
191; VI-NEXT:    flat_store_dword v[0:1], v2
192; VI-NEXT:    s_endpgm
193;
194; GFX90A-LABEL: v_select_trunc_i64_2:
195; GFX90A:       ; %bb.0:
196; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
197; GFX90A-NEXT:    s_load_dword s8, s[0:1], 0x2c
198; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
199; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
200; GFX90A-NEXT:    s_load_dword s9, s[4:5], 0x0
201; GFX90A-NEXT:    s_load_dword s10, s[6:7], 0x0
202; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
203; GFX90A-NEXT:    s_cmp_gt_u32 s8, 5
204; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
205; GFX90A-NEXT:    s_cselect_b32 s0, s9, s10
206; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
207; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
208; GFX90A-NEXT:    s_endpgm
209  %cmp = icmp ugt i32 %cond, 5
; Both loads are i64 in the IR; the expected code above reads one dword from
; each pointer.
210  %a = load i64, i64 addrspace(1)* %aptr, align 8
211  %b = load i64, i64 addrspace(1)* %bptr, align 8
212  %sel = select i1 %cmp, i64 %a, i64 %b
213  %trunc = trunc i64 %sel to i32
214  store i32 %trunc, i32 addrspace(1)* %out, align 4
215  ret void
216}
217
; v_select_i64_split_imm: i64 select between a value loaded through %aptr and
; the 64-bit constant 270582939648 (63 << 32, i.e. high dword 63, low dword 0).
; The full 64-bit result is stored to %out.
;
; For tahiti the assertions below expect the constant to be split into two
; 32-bit immediates folded directly into the per-half v_cndmask_b32
; instructions (63 for the high half, 0 for the low half). For tonga and
; gfx90a they expect the constant to be materialized in an SGPR pair with two
; s_mov_b32 instructions (0 and 63) and selected with one s_cselect_b64.
;
; NOTE(review): %b is loaded in the IR but never used by the select, and the
; expected code below contains no load through %bptr. It looks like a
; leftover from a sibling test - confirm before removing, and regenerate the
; assertions if the IR is changed.
218define amdgpu_kernel void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
219; SI-LABEL: v_select_i64_split_imm:
220; SI:       ; %bb.0:
221; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
222; SI-NEXT:    s_load_dword s6, s[0:1], 0xb
223; SI-NEXT:    s_waitcnt lgkmcnt(0)
224; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
225; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
226; SI-NEXT:    s_cmp_gt_u32 s6, 5
227; SI-NEXT:    s_cselect_b64 vcc, -1, 0
228; SI-NEXT:    s_mov_b32 s3, 0xf000
229; SI-NEXT:    s_waitcnt lgkmcnt(0)
230; SI-NEXT:    v_mov_b32_e32 v0, s5
231; SI-NEXT:    v_mov_b32_e32 v2, s4
232; SI-NEXT:    s_mov_b32 s2, -1
233; SI-NEXT:    v_cndmask_b32_e32 v1, 63, v0, vcc
234; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
235; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
236; SI-NEXT:    s_endpgm
237;
238; VI-LABEL: v_select_i64_split_imm:
239; VI:       ; %bb.0:
240; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
241; VI-NEXT:    s_load_dword s6, s[0:1], 0x2c
242; VI-NEXT:    s_mov_b32 s4, 0
243; VI-NEXT:    s_mov_b32 s5, 63
244; VI-NEXT:    s_waitcnt lgkmcnt(0)
245; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
246; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
247; VI-NEXT:    s_cmp_gt_u32 s6, 5
248; VI-NEXT:    s_waitcnt lgkmcnt(0)
249; VI-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
250; VI-NEXT:    v_mov_b32_e32 v0, s0
251; VI-NEXT:    v_mov_b32_e32 v2, s2
252; VI-NEXT:    v_mov_b32_e32 v1, s1
253; VI-NEXT:    v_mov_b32_e32 v3, s3
254; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
255; VI-NEXT:    s_endpgm
256;
257; GFX90A-LABEL: v_select_i64_split_imm:
258; GFX90A:       ; %bb.0:
259; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
260; GFX90A-NEXT:    s_load_dword s6, s[0:1], 0x2c
261; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
262; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
263; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
264; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
265; GFX90A-NEXT:    s_mov_b32 s2, 0
266; GFX90A-NEXT:    s_cmp_gt_u32 s6, 5
267; GFX90A-NEXT:    s_mov_b32 s3, 63
268; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
269; GFX90A-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
270; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
271; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
272; GFX90A-NEXT:    s_endpgm
273  %cmp = icmp ugt i32 %cond, 5
274  %a = load i64, i64 addrspace(1)* %aptr, align 8
; %b has no users in this function (the false arm of the select is a constant).
275  %b = load i64, i64 addrspace(1)* %bptr, align 8
276  %sel = select i1 %cmp, i64 %a, i64 270582939648 ; 63 << 32
277  store i64 %sel, i64 addrspace(1)* %out, align 8
278  ret void
279}
280